From f458e5e03b9d48bb08e7eaef27428544502317cf Mon Sep 17 00:00:00 2001 From: Roo Code Date: Tue, 10 Feb 2026 18:50:09 +0000 Subject: [PATCH 01/22] feat(web): add llms.txt and llms-full.txt for Answer Engine Optimization --- apps/web-roo-code/public/llms-full.txt | 354 +++++++++++++++++++++++++ apps/web-roo-code/public/llms.txt | 41 +++ 2 files changed, 395 insertions(+) create mode 100644 apps/web-roo-code/public/llms-full.txt create mode 100644 apps/web-roo-code/public/llms.txt diff --git a/apps/web-roo-code/public/llms-full.txt b/apps/web-roo-code/public/llms-full.txt new file mode 100644 index 00000000000..4bdd14c219c --- /dev/null +++ b/apps/web-roo-code/public/llms-full.txt @@ -0,0 +1,354 @@ +# Roo Code - Complete Product Information + +> Roo Code is an AI-powered software development platform with two core products: a free, open-source VS Code extension for interactive AI-assisted coding, and Roo Code Cloud for autonomous AI agents that work in the background. Built by Roo Code, Inc. + +## Table of Contents + +- What is Roo Code +- Roo Code VS Code Extension +- Roo Code Cloud +- Cloud Agents +- Roo Code for Slack +- Roo Code for Linear +- PR Reviewer Agent +- PR Fixer Agent +- Roo Code Enterprise +- Roo Code Router +- Pricing +- How Roo Code Compares to Alternatives +- Frequently Asked Questions + +--- + +## What is Roo Code + +Roo Code is an AI-powered software development platform that puts an entire AI dev team at your disposal. It goes beyond simple code autocompletion by reading and writing across multiple files, executing commands, running tests, and adapting to your workflow. + +Roo Code has two form factors: + +1. **Roo Code VS Code Extension** -- for individual, interactive work. Run Roo directly in VS Code (or any fork, including Cursor), stay close to the code, and control everything. Ideal for real-time debugging, quick iteration, and hands-on development. + +2. **Roo Code Cloud** -- for team work with autonomous agents. 
Create your agent team in the cloud, give them access to GitHub, and start delegating tasks from the web, Slack, Linear, and more. Ideal for parallelizing execution, kicking off projects, and looping in the rest of your team. + +The VS Code extension is completely free and open source. Roo Code Cloud offers free and paid tiers. + +### Key Principles + +- **Model-agnostic by design**: "The best model in the world" changes every other week. Roo Code works with dozens of models from frontier to open weight. Use the curated Roo Code Router selection at cost, or bring your own API key from any provider. +- **Custom modes for focused work**: Specialized modes -- Architect, Code, Ask, Debug, Test, Orchestrator -- stay on task and deliver. They know when to hand off work to other modes. You can create your own modes or download from the marketplace. +- **Permission-based control**: You approve every file change and command execution. Configure granular auto-approval rules to make Roo as autonomous as you want as you build confidence. +- **Open source and auditable**: Community-driven with no throttling or surprises about what is happening behind the scenes. SOC 2 Type II compliant. + +--- + +## Roo Code VS Code Extension + +The Roo Code VS Code extension is the #1 most-installed open-source AI coding extension. It is free, open source, and available on the VS Code Marketplace. + +**Website**: https://roocode.com/extension +**Install**: https://marketplace.visualstudio.com/items?itemName=RooVeterinaryInc.roo-cline +**Source code**: https://github.com/RooCodeInc/Roo-Code + +### Key Features + +- **Specialized modes**: Architect (plans complex changes without making changes), Code (implements, refactors, and optimizes), Ask (explains functionality and program behavior), Debug (diagnoses issues, traces failures, and proposes targeted fixes), Test (creates and improves tests without changing actual functionality), Orchestrator (coordinates large tasks across other agents). 
Users can also create custom modes. +- **Model-agnostic**: Supports OpenAI (GPT-4o, GPT-4, o1), Anthropic Claude (Claude 3.5 Sonnet and later), Google Gemini, Grok, DeepSeek, Mistral, Qwen, Kimi, Moonshot, local LLMs via Ollama, and any provider via OpenRouter. Dozens of providers supported. +- **Multi-file editing**: Reads, refactors, and updates multiple files at once for holistic code changes. +- **Agentic command execution**: Runs terminal commands like npm install, executes test suites, and can open a web browser for integration testing -- all with your approval. +- **Granular auto-approval**: Control each action individually. Make Roo as autonomous as you want as you build confidence. +- **Large task coordination**: Orchestrator mode handles large tasks by coordinating subtasks for other agents, running for hours and delivering. +- **Performant with large codebases**: Configurable integrated semantic search for quicker retrieval in large codebases. +- **Highly customizable**: Fine-tune settings for inference context, model properties, slash commands, and more. Most settings can be global or serialized in your repository. +- **Open source**: Community-driven and fully auditable. +- **Secure and private by design**: Client-only architecture means no code leaves your machine unless you say so. SOC 2 Type II compliant. Use .rooignore to exclude sensitive files. Run with offline/local models for full privacy. + +### Supported Languages + +Roo Code supports a wide range of programming languages including Python, Java, C#, JavaScript, TypeScript, Go, Rust, and many more. Since it leverages AI model capabilities, new or lesser-known languages may also work depending on model support. + +### Model Context Protocol (MCP) + +Roo Code supports the Model Context Protocol, allowing it to connect to any model that follows the MCP standard. This provides maximum flexibility in choosing AI providers. 
+ +--- + +## Roo Code Cloud + +Roo Code Cloud lets you create an AI agent team that runs autonomously in isolated cloud containers. Agents can be triggered from the web UI, Slack, Linear, or GitHub. + +**Website**: https://roocode.com/cloud +**Sign up**: https://app.roocode.com/sign-up + +### How It Works + +1. **Connect your GitHub account**: Pick which repos the agents can work with in their isolated containers and choose what model you want to power each of them. +2. **Set up your agent team**: Choose the roles you want filled -- Explainer, Planner, Coder, PR Reviewer, PR Fixer. They know how to act in each situation and stay on-task. +3. **Start giving them tasks**: Describe what you want from the web UI, get the Reviewer automatically reviewing PRs, and much more. + +### Cloud Features + +- **Autonomous Cloud Agents**: Delegate work to specialized agents that run 24/7. +- **Model Agnostic**: Bring your own keys or use the Roo Code Router with access to all top models with no markup. +- **GitHub PR Reviews**: Agents can automatically review pull requests, provide feedback, and push fixes directly to your repository. +- **Slack Integration**: Start tasks, get updates, and collaborate with agents directly from Slack channels. +- **Linear Integration**: Assign issues to Roo Code directly from Linear. Get PRs back without switching tools. +- **Team Collaboration**: Manage your team and their access to tasks and resources, with centralized billing and configuration. +- **Usage Analytics**: Detailed token analytics to help optimize costs and usage across your team. +- **Task History**: Access all tasks from anywhere, from the cloud and the extension. +- **Task Sharing**: Share tasks with friends and coworkers and let them follow your work in real-time. + +--- + +## Cloud Agents + +Roo Code Cloud provides several specialized agent types: + +### Planner Agent +Plans complex changes and creates detailed implementation specs. Can be invoked from Slack or the web UI. 
+ +### Coder Agent +Implements features, refactors code, and creates PRs. Works in isolated containers with full repository access. + +### Explainer Agent +Explains code, architecture, and program behavior. Useful for onboarding, code reviews, and knowledge sharing. + +### PR Reviewer Agent +Provides comprehensive AI-powered code reviews. See the dedicated PR Reviewer section below. + +### PR Fixer Agent +Automatically applies fixes based on review comments. See the dedicated PR Fixer section below. + +--- + +## Roo Code for Slack + +**Website**: https://roocode.com/slack + +Mention @Roomote in any Slack channel to explain code, plan features, or ship a PR -- all without leaving the conversation. + +### Key Capabilities + +- **Discussion to PR**: Your team discusses a feature in Slack. @Roomote turns the discussion into a plan, then builds it. +- **Thread-aware**: @Roomote reads the full thread before responding. It understands context from the conversation. +- **Chain agents**: Start with a Planner to spec it out, then call the Coder to build it. Multi-step workflows in one Slack thread. +- **Open to all**: Anyone on your team can ask @Roomote to fix bugs, build features, or investigate issues. +- **Safe by design**: Agents never touch main/master directly. They produce branches and PRs. You approve. + +### Slack Workflow + +1. Turn the discussion into a plan -- once your team has discussed a feature, summon the Planner agent. +2. Refine the plan in the thread -- the team reviews, suggests changes, and asks questions. Mention @Roomote to refine. +3. Build the plan -- hand it off to the Coder agent to implement. +4. Review and ship -- the Coder creates a branch and opens a PR. The team reviews and ships. + +### Setup + +Slack integration requires a Team plan. Connect via Roo Code Cloud settings, authorize the app, and add @Roomote to channels. 
+ +--- + +## Roo Code for Linear + +**Website**: https://roocode.com/linear + +Assign development work to @Roo Code directly from Linear. Get PRs back without switching tools. + +### Key Capabilities + +- **Work where you already work**: Assign development work directly from Linear. No new tools to learn. +- **Progress is visible**: Watch progress in real-time. Roo Code posts updates as comments. +- **Mention for refinement**: Comment "@Roo Code also add dark mode support" and the agent picks up where it left off. +- **Full traceability**: Every PR links back to the originating issue. Your audit trail stays clean. +- **Organization-level setup**: Connect once, use everywhere. +- **Safe by design**: Agents produce branches and PRs. You review and approve before merge. + +### Setup + +Linear integration requires a Team plan. Connect GitHub, authorize Linear via OAuth, map your Linear project to a repo, then assign or mention @Roo Code. + +--- + +## PR Reviewer Agent + +**Website**: https://roocode.com/reviewer + +AI-powered code reviews that catch what other AI tools and most humans miss. + +### How It Differs + +- **Bring your own key, get uncompromised reviews**: Most AI review tools use fixed pricing, which means they skimp on tokens to protect margins. With Roo, you bring your own API key. Reviews focus on real problems like business logic, security vulnerabilities, and architectural issues. +- **Advanced reasoning**: Leverages state-of-the-art reasoning models with sophisticated workflows: diff analysis, context gathering, impact mapping, and contract validation. Catches subtle bugs that surface-level tools miss. +- **Repository-aware, not snippet-aware**: Analyzes your entire codebase context -- dependency graphs, code ownership, team conventions, and historical patterns. Understands how changes interact with existing systems. + +### How It Works + +1. Connect your GitHub repository and configure which branches and pull requests should be reviewed. +2. 
Provide your AI provider API key and set review preferences, custom rules, and quality standards. +3. Every pull request gets detailed GitHub comments in minutes highlighting issues and suggesting improvements. + +--- + +## PR Fixer Agent + +**Website**: https://roocode.com/pr-fixer + +Automatically apply high-quality fixes to pull requests based on review comments. + +### How It Differs + +- **Comment-history aware**: Understands the entire conversation on the PR -- previous reviews, replies, follow-ups -- and uses that context to produce accurate fixes. +- **Bring your own key**: Use your preferred models at full strength. Prompts are optimized for depth, not cost-cutting. +- **Repository- and diff-aware**: Analyzes the full repo context and latest diff to ensure fixes align with project conventions and pass checks. + +### How It Works + +1. Connect your GitHub repositories. +2. Invoke from a PR comment (e.g., "@roomote: fix these review comments"). The agent reads the entire comment history and latest diffs. +3. The agent proposes targeted changes and pushes concise commits you can review and merge quickly. + +--- + +## Roo Code Enterprise + +**Website**: https://roocode.com/enterprise + +The control-plane for AI-powered software development. Gain visibility, governance, and control over your AI coding initiatives. + +### Enterprise Features + +- **Centralized AI Management Hub**: Manage Roo Code deployments enterprise-wide. Centralized token management, multi-model support, extensible architecture. +- **Real-Time Usage Visibility**: Track usage across teams with detailed analytics. Token consumption tracking, cost attribution by team, AI adoption insights. +- **Enterprise-Grade Governance**: Implement security policies aligned with your governance framework. Model allow-lists, data residency controls, audit trail compliance. +- **5-Minute Control-Plane Setup**: Deploy instantly as a SaaS solution. SAML/SCIM integration, REST API access. No infrastructure required. 
+- **Manage AI Development Costs**: Unified cost visibility, department chargebacks, usage optimization. +- **Zero Friction for Developers**: Seamless access with automatic token refresh, local sidecar architecture, no workflow disruption. + +--- + +## Roo Code Router + +**Website**: https://roocode.com/provider + +The Roo Code Router is a model router optimized to work seamlessly with Roo Code products. It provides curated access to top AI models with no markup on inference costs. You do not have to use it -- you can bring your own provider key. + +### Key Facts + +- Pricing is based on token usage for input and output, measured per million tokens. +- The Router does not keep any of your data; the service only aims to make it easier to use Roo Code. +- Available models include the latest from Anthropic, OpenAI, Google, and other top providers. + +--- + +## Pricing + +**Website**: https://roocode.com/pricing + +### VS Code Extension +- **Price**: Free forever +- **Features**: Unlimited local use, bring your own model, powerful extensible modes, community support + +### Cloud Free +- **Price**: $0/month + credits for usage +- **Features**: Access to Cloud Agents (fully autonomous development from GitHub and web), access to Roo Code Router, task history and sharing, token usage analytics, professional support +- **Credit cost**: Cloud Agents at $5/hour; inference via Roo Code Router pricing or bring your own model (BYOM) + +### Cloud Team +- **Price**: $99/month + credits for usage +- **Trial**: Free for 14 days +- **Features**: Everything in Free plus unlimited users (no per-seat cost), shared configuration and policies, centralized billing, Slack and Linear integrations +- **Credit cost**: Cloud Agents at $5/hour; inference via Roo Code Router pricing or bring your own model (BYOM) + +### Enterprise +- Custom pricing. Contact sales for a demo. + +### Credits +Credits are pre-paid in dollars and deducted with usage for inference and Cloud Agent runs. You are always in control of your spend with no surprises. 
+ +--- + +## How Roo Code Compares to Alternatives + +### vs GitHub Copilot +- Roo Code is open source; Copilot is proprietary +- Roo Code is model-agnostic; Copilot is locked to OpenAI/GitHub models +- Roo Code supports multi-file agentic editing and command execution; Copilot focuses on inline completions and chat +- Roo Code has custom modes for different tasks; Copilot has a single assistant +- Roo Code offers autonomous cloud agents; Copilot is IDE-only + +### vs Cursor +- Roo Code is a VS Code extension (works in any VS Code fork including Cursor); Cursor is a standalone IDE +- Roo Code is open source; Cursor is proprietary +- Roo Code is model-agnostic with no lock-in; Cursor bundles its own model access +- Roo Code has permission-based control for every action; Cursor applies changes more automatically +- Roo Code Cloud extends to autonomous agents; Cursor is IDE-only + +### vs Windsurf +- Roo Code is open source; Windsurf is proprietary +- Roo Code is a VS Code extension; Windsurf is a standalone IDE +- Roo Code supports any AI model; Windsurf bundles specific models +- Roo Code has specialized modes for different tasks; Windsurf has a single AI assistant +- Roo Code Cloud provides autonomous agents and team features; Windsurf is individual-only + +### vs Cline +- Roo Code was forked from Cline and has since diverged significantly +- Roo Code adds custom modes, Orchestrator mode, MCP marketplace, cloud agents, Slack/Linear integrations, and enterprise features +- Roo Code is backed by Roo Code, Inc. with dedicated development team and enterprise support +- Roo Code Cloud provides autonomous agents that run 24/7; Cline is extension-only + +--- + +## Frequently Asked Questions + +### What exactly is Roo Code? +Roo Code is an open-source, AI-powered coding assistant that runs in VS Code. 
It goes beyond simple autocompletion by reading and writing across multiple files, executing commands, and adapting to your workflow -- like having a whole dev team right inside your editor. + +### How does Roo Code differ from Copilot, Cursor, or Windsurf? +Roo Code is open-source and fully customizable, letting you integrate any AI model you choose (e.g., OpenAI, Anthropic, local LLMs). It is built for multi-file edits, so it can read, refactor, and update multiple files at once. Its agentic abilities go beyond typical AI autocomplete, enabling it to run tests, open a browser, and handle deeper tasks. You are always in control: Roo Code is permission-based, meaning you control and approve any file changes or command executions. + +### Is Roo Code really free? +Yes! Roo Code is completely free and open-source. You only pay for AI model usage if you use a paid API (like OpenAI). If you choose free or self-hosted models, there is no cost at all. Roo Code Cloud has free and paid tiers. + +### Will my code stay private? +Yes. The Roo Code extension runs locally in VS Code, so your code never leaves your machine unless you connect to an external AI API. Even then, you control exactly what is sent. You can use .rooignore to exclude sensitive files, and you can run with offline/local models for full privacy. + +### Which AI models does Roo Code support? +Roo Code is fully model-agnostic. It supports OpenAI models (GPT-4o, GPT-4, o1), Anthropic Claude (including Claude 3.5 Sonnet), Google Gemini models, local LLMs via Ollama, and any model accessible through OpenRouter or compatible APIs. + +### Does Roo Code support my programming language? +Likely yes. Roo Code supports Python, Java, C#, JavaScript, TypeScript, Go, Rust, and many more. Since it leverages AI model understanding, new or lesser-known languages may also work depending on model support. + +### How do I install and get started? +Install Roo Code from the VS Code Marketplace or GitHub. 
Add your AI keys (OpenAI, Anthropic, or other) in the extension settings. Open the Roo panel in VS Code and start typing commands in plain English. Tutorial videos are available at https://docs.roocode.com/tutorial-videos. + +### Can it handle large, enterprise-scale projects? +Yes. Roo Code uses efficient strategies like partial-file analysis, summarization, and user-specified context to handle large codebases. Enterprises can use on-premises or self-hosted models for compliance and security needs. + +### Is it safe for enterprise use? +Yes. Roo Code was built for enterprise environments. You can self-host AI models or use your own trusted provider. All file changes and commands go through permission gating. Because Roo Code is fully open-source, it is auditable. SOC 2 Type II compliant. + +### Can Roo Code run commands and tests automatically? +Yes. Roo Code can execute terminal commands, run test suites, and open a web browser for integration testing -- always optional and fully permission-based. + +### Can I contribute to Roo Code? +Yes! Roo Code is open-source on GitHub at https://github.com/RooCodeInc/Roo-Code. Submit issues, suggest features, or open a pull request. There is an active community on Discord (https://discord.gg/roocode) and Reddit (https://reddit.com/r/RooCode). + +### Where can I learn more or get help? +Check the official documentation at https://docs.roocode.com for quick-start guides and advanced documentation. Community support is available on Discord, Reddit, YouTube (https://www.youtube.com/@RooCodeYT), and the blog (https://blog.roocode.com). 
+ +--- + +## Links + +- Website: https://roocode.com +- Documentation: https://docs.roocode.com +- GitHub: https://github.com/RooCodeInc/Roo-Code +- VS Code Marketplace: https://marketplace.visualstudio.com/items?itemName=RooVeterinaryInc.roo-cline +- Cloud App: https://app.roocode.com +- Discord: https://discord.gg/roocode +- Reddit: https://reddit.com/r/RooCode +- X/Twitter: https://x.com/roocode +- LinkedIn: https://www.linkedin.com/company/roo-code +- YouTube: https://www.youtube.com/@RooCodeYT +- Blog: https://blog.roocode.com +- Trust Center: https://trust.roocode.com +- Careers: https://careers.roocode.com diff --git a/apps/web-roo-code/public/llms.txt b/apps/web-roo-code/public/llms.txt new file mode 100644 index 00000000000..94336605011 --- /dev/null +++ b/apps/web-roo-code/public/llms.txt @@ -0,0 +1,41 @@ +# Roo Code + +> Roo Code is an AI-powered software development platform with two core products: a free, open-source VS Code extension for interactive AI-assisted coding, and Roo Code Cloud for autonomous AI agents that work in the background. + +Roo Code is built by Roo Code, Inc. The VS Code extension is the #1 most-installed open-source AI coding extension on the VS Code Marketplace. Roo Code Cloud extends this with autonomous agents that can be triggered from the web, Slack, Linear, or GitHub. + +## Core Products + +- [Roo Code VS Code Extension](https://roocode.com/extension): Free, open-source AI coding assistant. Model-agnostic, supports multi-file editing, custom modes, agentic command execution, and permission-based control. Works with OpenAI, Anthropic, Google Gemini, local LLMs, and dozens more. +- [Roo Code Cloud](https://roocode.com/cloud): Autonomous AI agents -- Planner, Coder, Explainer, PR Reviewer, PR Fixer -- that run 24/7 in isolated cloud containers, triggered from the web UI, Slack, Linear, or GitHub. 
+- [Roo Code Enterprise](https://roocode.com/enterprise): Enterprise control-plane with centralized management, SAML/SCIM, usage analytics, cost controls, model allow-lists, and audit trails. + +## Integrations + +- [Roo Code for Slack](https://roocode.com/slack): Mention @Roomote in any Slack channel to plan, explain, or build features without leaving the conversation. +- [Roo Code for Linear](https://roocode.com/linear): Assign issues to @Roo Code directly from Linear and get PRs back. +- [PR Reviewer](https://roocode.com/reviewer): AI-powered code reviews using advanced reasoning and full repository context. Bring your own API key. +- [PR Fixer](https://roocode.com/pr-fixer): Automatically apply fixes to PRs based on review comments. Comment-history aware. + +## Key Differentiators + +- Open source and fully auditable +- Model-agnostic: works with any LLM provider, no lock-in +- Custom modes: Architect, Code, Ask, Debug, Test, Orchestrator, and user-created modes +- Permission-based: you approve every file change and command execution +- SOC 2 Type II compliant +- Bring your own API key: no markup on inference costs + +## Resources + +- [Documentation](https://docs.roocode.com) +- [Pricing](https://roocode.com/pricing) +- [Evals and Benchmarks](https://roocode.com/evals) +- [GitHub Repository](https://github.com/RooCodeInc/Roo-Code) +- [Blog](https://blog.roocode.com) +- [Discord Community](https://discord.gg/roocode) +- [Trust Center](https://trust.roocode.com) + +## Optional + +- [llms-full.txt](https://roocode.com/llms-full.txt): Comprehensive product information for detailed context From 417abeb691c4af20564ca952873bbce8ec063936 Mon Sep 17 00:00:00 2001 From: MP Date: Tue, 10 Feb 2026 10:59:37 -0800 Subject: [PATCH 02/22] Update apps/web-roo-code/public/llms-full.txt Co-authored-by: roomote[bot] <219738659+roomote[bot]@users.noreply.github.com> --- apps/web-roo-code/public/llms-full.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/apps/web-roo-code/public/llms-full.txt b/apps/web-roo-code/public/llms-full.txt index 4bdd14c219c..31a0cc3e29d 100644 --- a/apps/web-roo-code/public/llms-full.txt +++ b/apps/web-roo-code/public/llms-full.txt @@ -68,7 +68,7 @@ Roo Code supports a wide range of programming languages including Python, Java, ### Model Context Protocol (MCP) -Roo Code supports the Model Context Protocol, allowing it to connect to any model that follows the MCP standard. This provides maximum flexibility in choosing AI providers. +Roo Code supports the Model Context Protocol, allowing it to connect to external tools and data sources via MCP servers. This provides maximum flexibility in extending Roo Code's capabilities beyond built-in features. --- From 5a57ccf8f8b2633990efbd88e41752657d7e73db Mon Sep 17 00:00:00 2001 From: Michael Preuss Date: Wed, 11 Feb 2026 19:32:09 -0800 Subject: [PATCH 03/22] feat(web-evals): add AI Engineer Talent Marketplace (Sprint 1) - Add 5 engineer roles: Junior, Senior, Staff, Architecture Reviewer, Autonomous Agent - Build role selection landing page with hiring metaphor at /evals/workers - Build candidate rankings page with tiered recommendations at /evals/workers/[roleId] - Build candidate comparison page with Recharts charts at /evals/workers/[roleId]/compare - Build "How We Interview" methodology page at /evals/methodology - Add mock data with real eval scores from 27 model runs - Implement "Hire This Engineer" CTA linking to Roo Code Cloud - Implement "Configure Extension" CTA with clipboard copy - Per-language score breakdowns (Go, Java, JS, Python, Rust) - Daily salary pricing (80 tasks/agent/day estimate) - framer-motion animations, glass-morphism design, role color themes - Tone-of-voice compliance (no em dashes, no hype, workflow-first copy) - vscode:// deep link design doc at plans/vscode-deep-link-design.md --- .../evals/methodology/methodology-content.tsx | 949 ++++++++++++++++ .../src/app/evals/methodology/page.tsx | 58 + 
.../workers/[roleId]/candidates-content.tsx | 1010 +++++++++++++++++ .../[roleId]/compare/comparison-chart.tsx | 443 ++++++++ .../evals/workers/[roleId]/compare/page.tsx | 81 ++ .../workers/[roleId]/copy-settings-button.tsx | 42 + .../src/app/evals/workers/[roleId]/page.tsx | 104 ++ .../src/app/evals/workers/page.tsx | 85 ++ .../src/app/evals/workers/workers-content.tsx | 519 +++++++++ .../src/lib/mock-recommendations.ts | 866 ++++++++++++++ 10 files changed, 4157 insertions(+) create mode 100644 apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx create mode 100644 apps/web-roo-code/src/app/evals/methodology/page.tsx create mode 100644 apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx create mode 100644 apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx create mode 100644 apps/web-roo-code/src/app/evals/workers/[roleId]/compare/page.tsx create mode 100644 apps/web-roo-code/src/app/evals/workers/[roleId]/copy-settings-button.tsx create mode 100644 apps/web-roo-code/src/app/evals/workers/[roleId]/page.tsx create mode 100644 apps/web-roo-code/src/app/evals/workers/page.tsx create mode 100644 apps/web-roo-code/src/app/evals/workers/workers-content.tsx create mode 100644 apps/web-roo-code/src/lib/mock-recommendations.ts diff --git a/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx b/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx new file mode 100644 index 00000000000..18d9bcef3a2 --- /dev/null +++ b/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx @@ -0,0 +1,949 @@ +"use client" + +import { motion } from "framer-motion" +import { + ArrowRight, + FlaskConical, + Code, + GitBranch, + Building2, + AlertTriangle, + BarChart3, + Terminal, + ExternalLink, + CheckCircle2, + Beaker, + Timer, + DollarSign, + Zap, + Trophy, +} from "lucide-react" +import Link from "next/link" + +// ── Framer Motion Variants ────────────────────────────────────────────────── 
+ +const containerVariants = { + hidden: { opacity: 0 }, + visible: { + opacity: 1, + transition: { + staggerChildren: 0.12, + delayChildren: 0.1, + }, + }, +} + +const fadeUpVariants = { + hidden: { opacity: 0, y: 20 }, + visible: { + opacity: 1, + y: 0, + transition: { + duration: 0.6, + ease: [0.21, 0.45, 0.27, 0.9] as const, + }, + }, +} + +const backgroundVariants = { + hidden: { opacity: 0 }, + visible: { + opacity: 1, + transition: { + duration: 1.2, + ease: "easeOut" as const, + }, + }, +} + +const cardVariants = { + hidden: { opacity: 0, y: 30 }, + visible: { + opacity: 1, + y: 0, + transition: { + duration: 0.6, + ease: [0.21, 0.45, 0.27, 0.9] as const, + }, + }, +} + +// ── Section Number Marker ─────────────────────────────────────────────────── + +function SectionNumber({ num }: { num: string }) { + return ( + + {num} + + ) +} + +// ── Process Step Icon ─────────────────────────────────────────────────────── + +function ProcessStep({ + icon: Icon, + label, + isLast, +}: { + icon: React.ComponentType<{ className?: string }> + label: string + isLast?: boolean +}) { + return ( +
+
+
+ +
+ {label} +
+ {!isLast && ( +
+
+ +
+ )} +
+ ) +} + +// ── Language Card ─────────────────────────────────────────────────────────── + +function LanguageCard({ name, color }: { name: string; color: string }) { + return ( +
+
+ {name.slice(0, 2).toUpperCase()} +
+ {name} +
+ ) +} + +// ── Scoring Bar Component ─────────────────────────────────────────────────── + +function ScoringBar({ + label, + icon: Icon, + color, + bgColor, + weight, + description, +}: { + label: string + icon: React.ComponentType<{ className?: string }> + color: string + bgColor: string + weight: number + description: string +}) { + return ( +
+
+ +
+
+
+

{label}

+ {weight}% +
+

{description}

+
+ +
+
+
+ ) +} + +// ── Main Content Component ────────────────────────────────────────────────── + +export function MethodologyContent() { + return ( + <> + {/* ════════════════════════════════════════════════════════════════ + HERO SECTION + ════════════════════════════════════════════════════════════════ */} +
+ {/* Atmospheric blur background */} + +
+
+
+
+
+ + +
+ + {/* Breadcrumb */} + + + Evals + + / + + Hire an AI Engineer + + / + How We Interview + + + {/* Heading */} + + How We Interview{" "} + + AI Models + + + + {/* Subtitle */} + + Same exercises, same environment, same scoring for every model. Every step is documented and + every eval run is reproducible. + + + {/* Pill badge links */} + + + + View recommendations + + + + + Raw eval data + + + + +
+
+ + {/* ════════════════════════════════════════════════════════════════ + SECTION 01: THE INTERVIEW PROCESS + ════════════════════════════════════════════════════════════════ */} + +
+ + + + + + The Interview Process + + + +

+ We don't test models in isolation. We test them as they work inside Roo Code. Each + model gets the same exercises, same time limit, same tools. We measure what matters. +

+
+ + {/* Process flow */} + + + + + + + + {/* Key principles */} + + {[ + { + title: "Identical Environment", + desc: "Docker container with VS Code, Roo Code extension, and a fresh workspace per exercise.", + }, + { + title: "No Cherry-Picking", + desc: "Every model gets the exact same interview. No curated demos, no special treatment.", + }, + { + title: "Real Metrics", + desc: "Does it pass the tests? How much does it cost? How fast does it deliver?", + }, + ].map((item) => ( + +

{item.title}

+

{item.desc}

+
+ ))} +
+
+
+ + {/* ════════════════════════════════════════════════════════════════ + SECTION 02: THE INTERVIEW SUITE + ════════════════════════════════════════════════════════════════ */} + + {/* Subtle background glow */} + +
+
+
+ + +
+ + + + + + The Interview Suite + + + + Hundreds of coding exercises across 5 languages and{" "} + 3 difficulty tiers. From single-file fixes to + complex architecture decisions. + + + {/* Language cards */} + + + + + + + + + + + + + + + + + + + {/* Difficulty tiers */} + +

+ Difficulty Tiers +

+
+ {/* Easy */} +
+
+ E +
+
+

Easy

+

+ Single-file fixes, straightforward implementations, basic debugging +

+
+
+ + 90–95% + +
+ +
+
+
+ {/* Medium */} +
+
+ M +
+
+

Medium

+

+ Multi-file changes, refactoring, cross-file understanding +

+
+
+ + 60–80% + +
+ +
+
+
+ {/* Hard */} +
+
+ H +
+
+

Hard

+

+ Architecture decisions, ambiguous requirements, complex system design +

+
+
+ + 30–50% + +
+ +
+
+
+
+
+
+ + + {/* ════════════════════════════════════════════════════════════════ + SECTION 03: ENGINEER ROLES + ════════════════════════════════════════════════════════════════ */} + +
+ + + + + + Engineer Roles + + + + Not every task needs the same level of engineering. Three role tiers, each with different + exercise difficulty and scoring weights. + + + {/* Role cards */} + + {/* Junior */} + +
+
+
+ +
+

Junior Engineer

+

+ Easy + Medium exercises. Boilerplate, simple bug fixes, test generation. Scoring + emphasizes{" "} + cost efficiency. +

+ {/* Weight breakdown */} +
+

+ Scoring Weights +

+
+
+
+
+
+
+
+ Success 35% + Quality 15% + Cost 35% + Speed 15% +
+
+
+ + + {/* Senior */} + +
+
+
+ +
+

Senior Engineer

+

+ Medium exercises. Feature development, debugging, code review. Balanced scoring with + emphasis on{" "} + success rate + quality + . +

+ {/* Weight breakdown */} +
+

+ Scoring Weights +

+
+
+
+
+
+
+
+ Success 40% + Quality 25% + Cost 20% + Speed 15% +
+
+
+ + + {/* Staff */} + +
+
+
+ +
+

Staff Engineer

+

+ Hard exercises. Architecture, ambiguous requirements, system design. Scoring + prioritizes{" "} + + reasoning quality + correctness + + . +

+ {/* Weight breakdown */} +
+

+ Scoring Weights +

+
+
+
+
+
+
+
+ Success 45% + Quality 30% + Cost 10% + Speed 15% +
+
+
+ + +
+ + + {/* ════════════════════════════════════════════════════════════════ + SECTION 04: SCORING + ════════════════════════════════════════════════════════════════ */} + + {/* Background glow */} + +
+
+
+ + +
+ + + + + + Scoring + + + + Each model receives a composite score, a weighted + sum of four dimensions normalized to a 0–100 scale. + + + {/* Scoring formula components */} + + + + + + + + + + + + + + + + {/* Tier classification */} + +

+ Recommendation Tiers +

+

+ Composite scores are mapped to recommendation tiers: +

+
+ + + {/* Best */} + +
+ ≥85 +
+
+ + Best + +

Top Performer

+

Highly recommended for this role.

+
+
+ + {/* Recommended */} + +
+ 70–84 +
+
+ + Recommended + +

Solid Choice

+

+ Reliable for most tasks at this level. +

+
+
+ + {/* Situational */} + +
+ 50–69 +
+
+ + Situational + +

Usable with Caveats

+

May struggle in specific areas.

+
+
+ + {/* Not Recommended */} + +
+ <50 +
+
+ + Not Recommended + +

High Failure Rate

+

Not suitable for this role.

+
+
+
+ + + Per-language breakdowns reveal where each model excels or struggles. A model might score well + overall but underperform in Rust, or dominate in Python but lag in Go. + +
+ + + {/* ════════════════════════════════════════════════════════════════ + SECTION 05: RUN YOUR OWN INTERVIEWS + ════════════════════════════════════════════════════════════════ */} + +
+ + + + + + Run Your Own Interviews + + + + Our evaluation framework is fully open source. Run the exact same interviews on your own + infrastructure, with your own API keys, against any model. + + + {/* Terminal card */} + + {/* Terminal header */} +
+
+
+
+
+
+
+ + terminal +
+
+ {/* Terminal body */} +
+
+ ${" "} + git clone{" "} + https://github.com/RooCodeInc/Roo-Code-Evals.git +
+
+ $ cd{" "} + Roo-Code-Evals +
+
+ ${" "} + # Follow the README for setup instructions +
+
+ + + {/* GitHub link */} + + + + + + View on GitHub + + + +
+ + + {/* ════════════════════════════════════════════════════════════════ + SECTION 06: LIMITATIONS + ════════════════════════════════════════════════════════════════ */} + +
+ + + + + + Limitations + + + + Every evaluation has blind spots. These are ours. + + + + {[ + { + title: "Single test environment", + description: + "All evals run in Docker + VS Code. Results may differ in other IDEs or environments.", + }, + { + title: "Expanding exercise coverage", + description: + "Hundreds of exercises, but the suite is continuously growing. Some niche patterns may be underrepresented.", + }, + { + title: "API changes affect results", + description: + "Providers update their models. A model that scored well last month may behave differently after an update.", + }, + { + title: "Point-in-time snapshots", + description: + 'Each eval run captures performance at a specific point. We re-run regularly; check the "last updated" date.', + }, + ].map((item) => ( + + +
+

{item.title}

+

+ {item.description} +

+
+
+ ))} +
+
+
+ + {/* ════════════════════════════════════════════════════════════════ + BOTTOM NAVIGATION + ════════════════════════════════════════════════════════════════ */} +
+
+ + + Ready to see the results? + + + + + View recommendations + + + + + Raw eval data + + + + +
+
+ + ) +} diff --git a/apps/web-roo-code/src/app/evals/methodology/page.tsx b/apps/web-roo-code/src/app/evals/methodology/page.tsx new file mode 100644 index 00000000000..8a0960142cb --- /dev/null +++ b/apps/web-roo-code/src/app/evals/methodology/page.tsx @@ -0,0 +1,58 @@ +import type { Metadata } from "next" + +import { SEO } from "@/lib/seo" +import { ogImageUrl } from "@/lib/og" + +import { MethodologyContent } from "./methodology-content" + +// ── SEO Metadata ──────────────────────────────────────────────────────────── + +const TITLE = "How We Interview AI Models | Roo Code Evals" +const DESCRIPTION = "Our methodology for evaluating AI coding models. Transparent, reproducible, evidence-based." +const OG_DESCRIPTION = "Our methodology for evaluating AI coding models" +const PATH = "/evals/methodology" + +export const metadata: Metadata = { + title: TITLE, + description: DESCRIPTION, + alternates: { + canonical: `${SEO.url}${PATH}`, + }, + openGraph: { + title: TITLE, + description: DESCRIPTION, + url: `${SEO.url}${PATH}`, + siteName: SEO.name, + images: [ + { + url: ogImageUrl(TITLE, OG_DESCRIPTION), + width: 1200, + height: 630, + alt: TITLE, + }, + ], + locale: SEO.locale, + type: "website", + }, + twitter: { + card: SEO.twitterCard, + title: TITLE, + description: DESCRIPTION, + images: [ogImageUrl(TITLE, OG_DESCRIPTION)], + }, + keywords: [ + ...SEO.keywords, + "AI evaluation", + "model benchmarking", + "coding evals", + "methodology", + "interview process", + "transparent evaluation", + ], +} + +// ── Page Component ────────────────────────────────────────────────────────── + +export default function MethodologyPage() { + return +} diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx new file mode 100644 index 00000000000..c4a42a71f8f --- /dev/null +++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx @@ -0,0 +1,1010 @@ +"use 
client" + +import { motion } from "framer-motion" +import { + Code, + GitBranch, + Building2, + Search, + Bot, + ArrowRight, + ArrowLeft, + Trophy, + DollarSign, + Zap, + ExternalLink, + CheckCircle2, + AlertTriangle, + FlaskConical, + BarChart3, + Beaker, +} from "lucide-react" +import type { LucideIcon } from "lucide-react" +import Link from "next/link" + +import type { ModelCandidate, LanguageScores, EngineerRole } from "@/lib/mock-recommendations" + +import { CopySettingsButton } from "./copy-settings-button" + +// ── Icon Mapping ──────────────────────────────────────────────────────────── + +const ICON_MAP: Record = { + Code, + GitBranch, + Building2, + Search, + Bot, +} + +// ── Role Color Themes ─────────────────────────────────────────────────────── + +type RoleTheme = { + accent: string + accentLight: string + accentDark: string + iconBg: string + iconText: string + badgeBg: string + badgeText: string + borderHover: string + shadowHover: string + buttonBg: string + buttonHover: string + glowColor: string + gradientFrom: string + gradientVia: string + ringColor: string + scoreText: string + scoreBg: string + blurBg1: string + blurBg2: string + methodologyBorder: string +} + +const ROLE_THEMES: Record = { + junior: { + accent: "emerald", + accentLight: "text-emerald-600", + accentDark: "dark:text-emerald-400", + iconBg: "bg-emerald-100 dark:bg-emerald-900/30", + iconText: "text-emerald-700 dark:text-emerald-300", + badgeBg: "bg-emerald-100 dark:bg-emerald-900/30", + badgeText: "text-emerald-700 dark:text-emerald-300", + borderHover: "hover:border-emerald-500/40 dark:hover:border-emerald-400/30", + shadowHover: "hover:shadow-emerald-500/10 dark:hover:shadow-emerald-400/10", + buttonBg: "bg-emerald-600 dark:bg-emerald-600", + buttonHover: "hover:bg-emerald-700 dark:hover:bg-emerald-500", + glowColor: "bg-emerald-500/8 dark:bg-emerald-600/15", + gradientFrom: "from-emerald-500", + gradientVia: "via-emerald-400", + ringColor: "ring-emerald-500/30", + scoreText: 
"text-emerald-400", + scoreBg: "bg-emerald-500/10 border-emerald-500/20", + blurBg1: "bg-emerald-500/10 dark:bg-emerald-600/20", + blurBg2: "bg-emerald-400/5 dark:bg-emerald-500/10", + methodologyBorder: "border-emerald-500/30 hover:border-emerald-500/50", + }, + senior: { + accent: "blue", + accentLight: "text-blue-600", + accentDark: "dark:text-blue-400", + iconBg: "bg-blue-100 dark:bg-blue-900/30", + iconText: "text-blue-700 dark:text-blue-300", + badgeBg: "bg-blue-100 dark:bg-blue-900/30", + badgeText: "text-blue-700 dark:text-blue-300", + borderHover: "hover:border-blue-500/40 dark:hover:border-blue-400/30", + shadowHover: "hover:shadow-blue-500/10 dark:hover:shadow-blue-400/10", + buttonBg: "bg-blue-600 dark:bg-blue-600", + buttonHover: "hover:bg-blue-700 dark:hover:bg-blue-500", + glowColor: "bg-blue-500/8 dark:bg-blue-600/15", + gradientFrom: "from-blue-500", + gradientVia: "via-blue-400", + ringColor: "ring-blue-500/30", + scoreText: "text-blue-400", + scoreBg: "bg-blue-500/10 border-blue-500/20", + blurBg1: "bg-blue-500/10 dark:bg-blue-600/20", + blurBg2: "bg-blue-400/5 dark:bg-blue-500/10", + methodologyBorder: "border-blue-500/30 hover:border-blue-500/50", + }, + staff: { + accent: "amber", + accentLight: "text-amber-600", + accentDark: "dark:text-amber-400", + iconBg: "bg-amber-100 dark:bg-amber-900/30", + iconText: "text-amber-700 dark:text-amber-300", + badgeBg: "bg-amber-100 dark:bg-amber-900/30", + badgeText: "text-amber-700 dark:text-amber-300", + borderHover: "hover:border-amber-500/40 dark:hover:border-amber-400/30", + shadowHover: "hover:shadow-amber-500/10 dark:hover:shadow-amber-400/10", + buttonBg: "bg-amber-600 dark:bg-amber-600", + buttonHover: "hover:bg-amber-700 dark:hover:bg-amber-500", + glowColor: "bg-amber-500/8 dark:bg-amber-600/15", + gradientFrom: "from-amber-500", + gradientVia: "via-amber-400", + ringColor: "ring-amber-500/30", + scoreText: "text-amber-400", + scoreBg: "bg-amber-500/10 border-amber-500/20", + blurBg1: 
"bg-amber-500/10 dark:bg-amber-600/20", + blurBg2: "bg-amber-400/5 dark:bg-amber-500/10", + methodologyBorder: "border-amber-500/30 hover:border-amber-500/50", + }, + reviewer: { + accent: "violet", + accentLight: "text-violet-600", + accentDark: "dark:text-violet-400", + iconBg: "bg-violet-100 dark:bg-violet-900/30", + iconText: "text-violet-700 dark:text-violet-300", + badgeBg: "bg-violet-100 dark:bg-violet-900/30", + badgeText: "text-violet-700 dark:text-violet-300", + borderHover: "hover:border-violet-500/40 dark:hover:border-violet-400/30", + shadowHover: "hover:shadow-violet-500/10 dark:hover:shadow-violet-400/10", + buttonBg: "bg-violet-600 dark:bg-violet-600", + buttonHover: "hover:bg-violet-700 dark:hover:bg-violet-500", + glowColor: "bg-violet-500/8 dark:bg-violet-600/15", + gradientFrom: "from-violet-500", + gradientVia: "via-violet-400", + ringColor: "ring-violet-500/30", + scoreText: "text-violet-400", + scoreBg: "bg-violet-500/10 border-violet-500/20", + blurBg1: "bg-violet-500/10 dark:bg-violet-600/20", + blurBg2: "bg-violet-400/5 dark:bg-violet-500/10", + methodologyBorder: "border-violet-500/30 hover:border-violet-500/50", + }, + autonomous: { + accent: "cyan", + accentLight: "text-cyan-600", + accentDark: "dark:text-cyan-400", + iconBg: "bg-cyan-100 dark:bg-cyan-900/30", + iconText: "text-cyan-700 dark:text-cyan-300", + badgeBg: "bg-cyan-100 dark:bg-cyan-900/30", + badgeText: "text-cyan-700 dark:text-cyan-300", + borderHover: "hover:border-cyan-500/40 dark:hover:border-cyan-400/30", + shadowHover: "hover:shadow-cyan-500/10 dark:hover:shadow-cyan-400/10", + buttonBg: "bg-cyan-600 dark:bg-cyan-600", + buttonHover: "hover:bg-cyan-700 dark:hover:bg-cyan-500", + glowColor: "bg-cyan-500/8 dark:bg-cyan-600/15", + gradientFrom: "from-cyan-500", + gradientVia: "via-cyan-400", + ringColor: "ring-cyan-500/30", + scoreText: "text-cyan-400", + scoreBg: "bg-cyan-500/10 border-cyan-500/20", + blurBg1: "bg-cyan-500/10 dark:bg-cyan-600/20", + blurBg2: 
"bg-cyan-400/5 dark:bg-cyan-500/10", + methodologyBorder: "border-cyan-500/30 hover:border-cyan-500/50", + }, +} + +const DEFAULT_THEME = ROLE_THEMES.senior! + +// ── Framer Motion Variants ────────────────────────────────────────────────── + +const containerVariants = { + hidden: { opacity: 0 }, + visible: { + opacity: 1, + transition: { + staggerChildren: 0.12, + delayChildren: 0.1, + }, + }, +} + +const cardVariants = { + hidden: { opacity: 0, y: 30 }, + visible: { + opacity: 1, + y: 0, + transition: { + duration: 0.6, + ease: [0.21, 0.45, 0.27, 0.9] as const, + }, + }, +} + +const fadeUpVariants = { + hidden: { opacity: 0, y: 20 }, + visible: { + opacity: 1, + y: 0, + transition: { + duration: 0.6, + ease: [0.21, 0.45, 0.27, 0.9] as const, + }, + }, +} + +const backgroundVariants = { + hidden: { opacity: 0 }, + visible: { + opacity: 1, + transition: { + duration: 1.2, + ease: "easeOut" as const, + }, + }, +} + +const tableRowVariants = { + hidden: { opacity: 0, x: -10 }, + visible: { + opacity: 1, + x: 0, + transition: { + duration: 0.4, + ease: [0.21, 0.45, 0.27, 0.9] as const, + }, + }, +} + +// ── Helpers ───────────────────────────────────────────────────────────────── + +function scoreBadgeColor(score: number): string { + if (score >= 85) return "bg-green-500/10 text-green-400 border border-green-500/20" + if (score >= 70) return "bg-blue-500/10 text-blue-400 border border-blue-500/20" + if (score >= 50) return "bg-yellow-500/10 text-yellow-400 border border-yellow-500/20" + return "bg-red-500/10 text-red-400 border border-red-500/20" +} + +function tierBadge(tier: ModelCandidate["tier"]): { label: string; className: string } { + switch (tier) { + case "best": + return { + label: "Best", + className: "bg-green-500/10 text-green-400 border border-green-500/20", + } + case "recommended": + return { + label: "Recommended", + className: "bg-blue-500/10 text-blue-400 border border-blue-500/20", + } + case "situational": + return { + label: "Situational", + 
className: "bg-yellow-500/10 text-yellow-400 border border-yellow-500/20", + } + case "not-recommended": + return { + label: "Not Recommended", + className: "bg-red-500/10 text-red-400 border border-red-500/20", + } + } +} + +const RANK_BADGES = ["🥇", "🥈", "🥉"] + +const LANGUAGE_CONFIG: { key: keyof LanguageScores; label: string; color: string; bgColor: string }[] = [ + { key: "python", label: "Python", color: "bg-green-500", bgColor: "bg-green-500/20" }, + { key: "javascript", label: "JS", color: "bg-yellow-500", bgColor: "bg-yellow-500/20" }, + { key: "java", label: "Java", color: "bg-orange-500", bgColor: "bg-orange-500/20" }, + { key: "go", label: "Go", color: "bg-cyan-500", bgColor: "bg-cyan-500/20" }, + { key: "rust", label: "Rust", color: "bg-red-500", bgColor: "bg-red-500/20" }, +] + +function settingsLabel(candidate: ModelCandidate): string { + const parts = [`temp=${candidate.settings.temperature}`] + if (candidate.settings.reasoningEffort) { + parts.push(`reasoning=${candidate.settings.reasoningEffort}`) + } + return parts.join(", ") +} + +// ── Language Score Bars ───────────────────────────────────────────────────── + +function LanguageBars({ scores }: { scores: LanguageScores }) { + return ( +
+ {LANGUAGE_CONFIG.map(({ key, label, color, bgColor }) => { + const value = scores[key] + return ( +
+ {label} +
+
+ +
+
+ {value} +
+ ) + })} +
+ ) +} + +// ── Composite Score Ring ──────────────────────────────────────────────────── + +function ScoreRing({ score, theme }: { score: number; theme: RoleTheme }) { + const circumference = 2 * Math.PI * 40 + const strokeDashoffset = circumference - (score / 100) * circumference + + return ( +
+ + + + +
+ {score} +
+
+ ) +} + +// ── Candidate Card ────────────────────────────────────────────────────────── + +function CandidateCard({ + candidate, + rank, + theme, + cloudUrl, + highlight, +}: { + candidate: ModelCandidate + rank?: number + theme: RoleTheme + cloudUrl: string + highlight?: "cost" | "speed" +}) { + const tier = tierBadge(candidate.tier) + const copySettings = { + provider: candidate.provider, + model: candidate.modelId, + temperature: candidate.settings.temperature, + ...(candidate.settings.reasoningEffort ? { reasoningEffort: candidate.settings.reasoningEffort } : {}), + } + + return ( + + {/* Subtle glow on hover */} +
+ +
+ {/* Rank badge */} + {rank !== undefined && rank < 3 && ( +
+ {RANK_BADGES[rank]} +
+ )} + + {/* Provider label */} +

+ {candidate.provider} +

+ + {/* Model name */} +

{candidate.displayName}

+ + {/* Tier pill */} +
+ + {tier.label} + +
+ + {/* Score ring */} +
+ +
+ + {/* Key metrics grid */} +
+
+

Success

+

{candidate.successRate}%

+
+
+

+ Daily Cost +

+

+ ~${Math.round(candidate.estimatedDailyCost)}/day +

+

+ (${candidate.avgCostPerTask.toFixed(3)}/task) +

+
+
+

Avg Time

+

+ {candidate.avgTimePerTask.toFixed(1)}s +

+
+
+ + {/* Per-language breakdown */} +
+

+ Language Scores +

+ +
+ + {/* Recommended settings */} +
+ {settingsLabel(candidate)} +
+ + {/* Caveats */} + {candidate.caveats && candidate.caveats.length > 0 && ( +
+ {candidate.caveats.map((caveat) => ( +

+ + {caveat} +

+ ))} +
+ )} + + {/* CTAs */} + +
+ + ) +} + +// ── Compact Card (Budget / Speed) ─────────────────────────────────────────── + +function CompactCard({ + candidate, + label, + icon: IconComp, + highlight, + theme, + cloudUrl, +}: { + candidate: ModelCandidate + label: string + icon: LucideIcon + highlight: "cost" | "speed" + theme: RoleTheme + cloudUrl: string +}) { + const tier = tierBadge(candidate.tier) + const copySettings = { + provider: candidate.provider, + model: candidate.modelId, + temperature: candidate.settings.temperature, + ...(candidate.settings.reasoningEffort ? { reasoningEffort: candidate.settings.reasoningEffort } : {}), + } + + return ( + + {/* Subtle glow on hover */} +
+ +
+ {/* Label header */} +
+
+ +
+

{label}

+
+ +
+ {/* Left: model info + score */} +
+

+ {candidate.provider} +

+

{candidate.displayName}

+ + {/* Score + tier */} +
+ {candidate.compositeScore} + + {tier.label} + +
+
+ + {/* Right: highlighted metric */} +
+

+ {highlight === "cost" ? "Daily Cost" : "Avg Time"} +

+

+ {highlight === "cost" + ? `~$${Math.round(candidate.estimatedDailyCost)}/day` + : `${candidate.avgTimePerTask.toFixed(1)}s`} +

+ {highlight === "cost" && ( +

+ (${candidate.avgCostPerTask.toFixed(3)}/task) +

+ )} +
+
+ + {/* Metrics row */} +
+
+

Success

+

{candidate.successRate}%

+
+
+

+ Daily Cost +

+

+ ~${Math.round(candidate.estimatedDailyCost)} +

+

+ (${candidate.avgCostPerTask.toFixed(3)}/task) +

+
+
+

Time

+

{candidate.avgTimePerTask.toFixed(1)}s

+
+
+ + {/* Language bars */} +
+ +
+ + {/* Settings */} +
+ {settingsLabel(candidate)} +
+ + {/* CTAs */} + +
+ + ) +} + +// ── Props ─────────────────────────────────────────────────────────────────── + +export type CandidatesContentProps = { + roleId: string + role: EngineerRole + best: ModelCandidate[] + budgetHire: ModelCandidate | null + speedHire: ModelCandidate | null + allCandidates: ModelCandidate[] + totalEvalRuns: number + totalExercises: number + lastUpdated: string + cloudUrls: Record +} + +// ── Main Content Component ────────────────────────────────────────────────── + +export function CandidatesContent({ + roleId, + role, + best, + budgetHire, + speedHire, + allCandidates, + totalEvalRuns, + totalExercises, + lastUpdated, + cloudUrls, +}: CandidatesContentProps) { + const theme = ROLE_THEMES[roleId] ?? DEFAULT_THEME + const IconComponent = ICON_MAP[role.icon] ?? Code + + return ( + <> + {/* ── Role Header ────────────────────────────────────────────── */} +
+ {/* Atmospheric blur gradient background */} + +
+
+
+
+
+ + +
+ + {/* Breadcrumb */} + + + Evals + + / + + Hire an AI Engineer + + / + {role.name} + + + {/* Icon + Title row */} + +
+ +
+
+

{role.name}

+ + {role.salaryRange} + +

+ {role.description} +

+
+
+ + {/* Stats bar */} + + + + + {totalEvalRuns.toLocaleString()} + + eval runs + +
+ + + + {totalExercises.toLocaleString()} + + exercises + +
+ + Updated{" "} + + {new Date(lastUpdated).toLocaleDateString("en-US", { + month: "short", + day: "numeric", + year: "numeric", + })} + + +
+ + + How we interview + + + + + {/* Strengths + Trade-offs grid */} + +
+

+ Strengths +

+ {role.strengths.map((s) => ( +
+ + {s} +
+ ))} +
+
+

+ Trade-offs +

+ {role.weaknesses.map((w) => ( +
+ + {w} +
+ ))} +
+
+ +
+
+ + {/* ── Top Candidates: Best Overall ────────────────────────────── */} +
+
+ + + + Top Candidates + + + + {best.map((candidate, i) => ( + + ))} + + +
+
+ + {/* ── Budget & Speed Hire ─────────────────────────────────────── */} + {(budgetHire || speedHire) && ( +
+
+ + {budgetHire && ( + + )} + {speedHire && ( + + )} + +
+
+ )} + + {/* ── All Candidates Table ────────────────────────────────────── */} +
+ {/* Subtle background */} + +
+
+
+ + +
+ + + All Candidates + + + + + + + + + + + + + + + + + + {allCandidates.map((candidate, i) => { + const tier = tierBadge(candidate.tier) + return ( + + + + + + + + + + + ) + })} + +
+ # + + Model + + Provider + + Score + + Tier + + Success + + Daily Cost + + Time +
+ {i + 1} + {candidate.displayName} + {candidate.provider} + + + {candidate.compositeScore} + + + + {tier.label} + + + {candidate.successRate}% + + ~${Math.round(candidate.estimatedDailyCost)} + + (${candidate.avgCostPerTask.toFixed(3)}) + + + {candidate.avgTimePerTask.toFixed(1)}s +
+
+ + {/* Compare link */} + + + 📊 Compare all candidates + + + +
+
+
+ + {/* ── Bottom Navigation ───────────────────────────────────────── */} +
+
+ + + + + Back to all roles + + + + + 📊 Compare candidates + + + 📋 Raw eval data + + + + +
+
+ + ) +} diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx new file mode 100644 index 00000000000..4b87bbb1cf9 --- /dev/null +++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx @@ -0,0 +1,443 @@ +"use client" + +import { useState, useMemo, useCallback } from "react" +import Link from "next/link" +import { ArrowLeft, Copy, Check, FileJson, FileSpreadsheet } from "lucide-react" +import { BarChart, Bar, XAxis, YAxis, Tooltip, ResponsiveContainer, Legend } from "recharts" + +import type { ModelCandidate, LanguageScores, EngineerRole, RoleRecommendation } from "@/lib/mock-recommendations" +import { TASKS_PER_DAY } from "@/lib/mock-recommendations" + +// ── Constants ─────────────────────────────────────────────────────────────── + +const LANGUAGES: { key: keyof LanguageScores; label: string }[] = [ + { key: "go", label: "Go" }, + { key: "java", label: "Java" }, + { key: "javascript", label: "JavaScript" }, + { key: "python", label: "Python" }, + { key: "rust", label: "Rust" }, +] + +const PROVIDERS = ["anthropic", "openai", "google", "deepseek", "groq", "alibaba", "mistral"] as const + +const PROVIDER_LABELS: Record = { + anthropic: "Anthropic", + openai: "OpenAI", + google: "Google", + deepseek: "DeepSeek", + groq: "Meta/Groq", + alibaba: "Alibaba", + mistral: "Mistral", +} + +const DIMENSION_COLORS = { + composite: "#3b82f6", // blue + success: "#22c55e", // green + cost: "#f59e0b", // amber + speed: "#a855f7", // purple +} + +// ── Helpers ───────────────────────────────────────────────────────────────── + +/** Normalize cost: lower cost → higher bar (0–100). */ +function normalizeCost(cost: number, maxCost: number): number { + if (maxCost === 0) return 100 + return Math.round((1 - cost / maxCost) * 100) +} + +/** Normalize speed: lower time → higher bar (0–100). 
*/ +function normalizeSpeed(time: number, maxTime: number): number { + if (maxTime === 0) return 100 + return Math.round((1 - time / maxTime) * 100) +} + +function buildChartData( + candidates: ModelCandidate[], + language: keyof LanguageScores | "all", + maxCost: number, + maxTime: number, +) { + return candidates.map((c) => ({ + name: c.displayName, + composite: language === "all" ? c.compositeScore : c.languageScores[language], + success: c.successRate, + costEfficiency: normalizeCost(c.avgCostPerTask, maxCost), + speed: normalizeSpeed(c.avgTimePerTask, maxTime), + // raw daily cost for tooltip display + dailyCost: Math.round(c.estimatedDailyCost), + costPerTask: c.avgCostPerTask, + // raw data for export + _raw: c, + })) +} + +function candidateToCsvRow(c: ModelCandidate): string { + return [ + c.provider, + c.modelId, + c.displayName, + c.compositeScore, + c.successRate, + c.avgCostPerTask, + Math.round(c.estimatedDailyCost), + c.avgTimePerTask, + c.languageScores.go, + c.languageScores.java, + c.languageScores.javascript, + c.languageScores.python, + c.languageScores.rust, + c.tier, + `"${c.settings.temperature}"`, + `"${c.settings.reasoningEffort ?? 
""}"`, + ].join(",") +} + +function downloadBlob(content: string, filename: string, mimeType: string) { + const blob = new Blob([content], { type: mimeType }) + const url = URL.createObjectURL(blob) + const a = document.createElement("a") + a.href = url + a.download = filename + document.body.appendChild(a) + a.click() + document.body.removeChild(a) + URL.revokeObjectURL(url) +} + +// ── Custom Tooltip ────────────────────────────────────────────────────────── + +function CustomTooltip({ + active, + payload, + label, +}: { + active?: boolean + // eslint-disable-next-line @typescript-eslint/no-explicit-any + payload?: any[] + label?: string +}) { + if (!active || !payload || !payload.length) return null + + // Extract raw daily cost from first payload entry's data + const rawData = payload[0]?.payload as { dailyCost?: number; costPerTask?: number } | undefined + const dailyCost = rawData?.dailyCost + const costPerTask = rawData?.costPerTask + + return ( +
+

{label}

+ {payload.map( + ( + entry: { + name: string + value: number + color: string + dataKey: string + }, + index: number, + ) => ( +
+ + {entry.name}: + + {entry.dataKey === "costEfficiency" && dailyCost !== undefined + ? `${entry.value} (~$${dailyCost}/day · $${costPerTask?.toFixed(3)}/task)` + : entry.value} + +
+ ), + )} +
+ ) +} + +// ── Main Component ────────────────────────────────────────────────────────── + +interface ComparisonChartProps { + recommendation: RoleRecommendation + role: EngineerRole + roleId: string +} + +export function ComparisonChart({ recommendation, role, roleId }: ComparisonChartProps) { + const { allCandidates } = recommendation + + // ── State ─────────────────────────────────────────────────────────────── + const [selectedLanguage, setSelectedLanguage] = useState("all") + const [enabledProviders, setEnabledProviders] = useState>(() => new Set(PROVIDERS)) + const [minSuccessRate, setMinSuccessRate] = useState(0) + const [copiedSettings, setCopiedSettings] = useState(false) + + // ── Derived ───────────────────────────────────────────────────────────── + + const filteredCandidates = useMemo( + () => allCandidates.filter((c) => enabledProviders.has(c.provider) && c.successRate >= minSuccessRate), + [allCandidates, enabledProviders, minSuccessRate], + ) + + const maxCost = useMemo(() => Math.max(...allCandidates.map((c) => c.avgCostPerTask), 0.001), [allCandidates]) + + const maxTime = useMemo(() => Math.max(...allCandidates.map((c) => c.avgTimePerTask), 0.1), [allCandidates]) + + const chartData = useMemo( + () => buildChartData(filteredCandidates, selectedLanguage, maxCost, maxTime), + [filteredCandidates, selectedLanguage, maxCost, maxTime], + ) + + const chartHeight = Math.max(300, chartData.length * 60 + 80) + + // ── Handlers ──────────────────────────────────────────────────────────── + + const toggleProvider = useCallback((provider: string) => { + setEnabledProviders((prev) => { + const next = new Set(prev) + if (next.has(provider)) { + next.delete(provider) + } else { + next.add(provider) + } + return next + }) + }, []) + + const handleCopySettings = useCallback(async () => { + const settings = filteredCandidates.map((c) => ({ + provider: c.provider, + model: c.modelId, + displayName: c.displayName, + temperature: c.settings.temperature, + 
...(c.settings.reasoningEffort ? { reasoningEffort: c.settings.reasoningEffort } : {}), + })) + await navigator.clipboard.writeText(JSON.stringify(settings, null, 2)) + setCopiedSettings(true) + setTimeout(() => setCopiedSettings(false), 2000) + }, [filteredCandidates]) + + const handleExportCsv = useCallback(() => { + const header = + "Provider,Model ID,Display Name,Composite Score,Success Rate,Avg Cost/Task,Est. Daily Cost,Avg Time/Task,Go,Java,JavaScript,Python,Rust,Tier,Temperature,Reasoning Effort" + const rows = filteredCandidates.map(candidateToCsvRow) + const csv = [header, ...rows].join("\n") + downloadBlob(csv, `${roleId}-comparison.csv`, "text/csv") + }, [filteredCandidates, roleId]) + + const handleExportJson = useCallback(() => { + const json = JSON.stringify(filteredCandidates, null, 2) + downloadBlob(json, `${roleId}-comparison.json`, "application/json") + }, [filteredCandidates, roleId]) + + // ── Render ────────────────────────────────────────────────────────────── + + return ( +
+ {/* ── Breadcrumb ─────────────────────────────────────────────── */} + + + {/* ── Page Header ────────────────────────────────────────────── */} +
+

Compare Candidates — {role.name}

+

+ Interactive comparison across composite score, success rate, cost efficiency, and speed. +

+
+ + {/* ── Language Toggle ─────────────────────────────────────────── */} +
+

+ Score View +

+
+ + {LANGUAGES.map(({ key, label }) => ( + + ))} +
+
+ + {/* ── Filters ────────────────────────────────────────────────── */} +
+ {/* Provider checkboxes */} +
+

+ Providers +

+
+ {PROVIDERS.map((p) => ( + + ))} +
+
+ + {/* Min success rate slider */} +
+

+ Min Success Rate +

+
+ setMinSuccessRate(Number(e.target.value))} + className="h-2 flex-1 cursor-pointer accent-blue-600" + /> + {minSuccessRate}% +
+
+
+ + {/* ── Chart ──────────────────────────────────────────────────── */} +
+

+ {selectedLanguage === "all" + ? "Composite Score" + : `${LANGUAGES.find((l) => l.key === selectedLanguage)?.label} Score`}{" "} + Comparison +

+

+ Cost Efficiency and Speed are inverted — higher bars mean cheaper / faster. Daily costs assume ~ + {TASKS_PER_DAY} tasks per agent per day (~6 productive hours). +

+ + {chartData.length === 0 ? ( +
+ No candidates match the current filters. +
+ ) : ( + + + `${v}`} /> + + } /> + + l.key === selectedLanguage)?.label ?? "Language"} Score` + } + fill={DIMENSION_COLORS.composite} + radius={[0, 4, 4, 0]} + barSize={12} + /> + + + + + + )} +
+ + {/* ── Export Buttons ──────────────────────────────────────────── */} +
+ + + +
+ + {/* ── Bottom Navigation ───────────────────────────────────────── */} + +
+	)
+}
diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/page.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/page.tsx
new file mode 100644
index 00000000000..d79b22eedda
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/page.tsx
@@ -0,0 +1,90 @@
+import { notFound } from "next/navigation"
+import type { Metadata } from "next"
+
+import { SEO } from "@/lib/seo"
+import { ogImageUrl } from "@/lib/og"
+import { getEngineerRole, getRoleRecommendation } from "@/lib/mock-recommendations"
+
+import { ComparisonChart } from "./comparison-chart"
+
+// ── SEO Metadata ────────────────────────────────────────────────────────────
+
+type PageProps = { params: Promise<{ roleId: string }> }
+
+/**
+ * Builds the SEO metadata (title, description, canonical URL, Open Graph and
+ * Twitter cards, keywords) for the compare page of the role given by `roleId`.
+ * Returns a minimal "Role Not Found" title/description when the role is unknown.
+ */
+export async function generateMetadata({ params }: PageProps): Promise<Metadata> {
+	const { roleId } = await params
+	const role = getEngineerRole(roleId)
+
+	if (!role) {
+		return {
+			title: "Role Not Found | Roo Code Evals",
+			description: "The requested engineer role was not found.",
+		}
+	}
+
+	const title = `Compare Candidates — ${role.name} | Roo Code Evals`
+	const description = `Interactive comparison of AI model candidates for the ${role.name} role. Compare composite score, success rate, cost efficiency, and speed.`
+	const ogDescription = `Compare Candidates — ${role.name}`
+	const path = `/evals/workers/${roleId}/compare`
+
+	return {
+		title,
+		description,
+		alternates: {
+			canonical: `${SEO.url}${path}`,
+		},
+		openGraph: {
+			title,
+			description,
+			url: `${SEO.url}${path}`,
+			siteName: SEO.name,
+			images: [
+				{
+					url: ogImageUrl(title, ogDescription),
+					width: 1200,
+					height: 630,
+					alt: title,
+				},
+			],
+			locale: SEO.locale,
+			type: "website",
+		},
+		twitter: {
+			card: SEO.twitterCard,
+			title,
+			description,
+			images: [ogImageUrl(title, ogDescription)],
+		},
+		keywords: [
+			...SEO.keywords,
+			"AI engineer",
+			"model comparison",
+			"coding evals",
+			role.name.toLowerCase(),
+			"bar chart",
+			"candidate comparison",
+		],
+	}
+}
+
+// ── Page Component ──────────────────────────────────────────────────────────
+
+/**
+ * Server page for `/evals/workers/[roleId]/compare`: looks up the role's
+ * recommendation and calls `notFound()` when there is none.
+ */
+export default async function CompareCandidatesPage({ params }: PageProps) {
+	const { roleId } = await params
+	const recommendation = getRoleRecommendation(roleId)
+
+	if (!recommendation) {
+		notFound()
+	}
+
+	return
+}
diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/copy-settings-button.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/copy-settings-button.tsx
new file mode 100644
index 00000000000..06adacff5a2
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/copy-settings-button.tsx
@@ -0,0 +1,52 @@
+"use client"
+
+import { useState } from "react"
+import { Copy, Check } from "lucide-react"
+
+interface CopySettingsButtonProps {
+	settings: {
+		provider: string
+		model: string
+		temperature: number
+		reasoningEffort?: string
+	}
+}
+
+/** Copies the given model settings to the clipboard as pretty-printed JSON. */
+export function CopySettingsButton({ settings }: CopySettingsButtonProps) {
+	const [copied, setCopied] = useState(false)
+
+	// Serialize the settings and copy them to the clipboard; the "copied" state is
+	// shown for two seconds, and only after the write actually succeeded.
+	const handleCopy = async () => {
+		const json = JSON.stringify(settings, null, 2)
+		try {
+			await navigator.clipboard.writeText(json)
+		} catch (error) {
+			// writeText() rejects when clipboard access is unavailable or denied;
+			// without this guard the rejection escapes the click handler unhandled.
+			console.error("Failed to copy settings to clipboard:", error)
+			return
+		}
+		setCopied(true)
+		setTimeout(() => setCopied(false), 2000)
+	}
+
+	
return (
+
+	)
+}
diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/page.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/page.tsx
new file mode 100644
index 00000000000..8daa554cd2d
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/page.tsx
@@ -0,0 +1,112 @@
+import { notFound } from "next/navigation"
+import type { Metadata } from "next"
+
+import { SEO } from "@/lib/seo"
+import { ogImageUrl } from "@/lib/og"
+import { getRoleRecommendation, getCloudSetupUrl } from "@/lib/mock-recommendations"
+
+import { CandidatesContent } from "./candidates-content"
+
+// ── SEO Metadata ────────────────────────────────────────────────────────────
+
+type PageProps = { params: Promise<{ roleId: string }> }
+
+/**
+ * Builds the SEO metadata for a role's candidates page from its recommendation.
+ * Returns a minimal "Role Not Found" title/description when `roleId` has none.
+ */
+export async function generateMetadata({ params }: PageProps): Promise<Metadata> {
+	const { roleId } = await params
+	const recommendation = getRoleRecommendation(roleId)
+
+	if (!recommendation) {
+		return {
+			title: "Role Not Found | Roo Code Evals",
+			description: "The requested engineer role was not found.",
+		}
+	}
+
+	const { role } = recommendation
+	const title = `${role.name} — AI Engineer Candidates | Roo Code Evals`
+	const description = `Interview results for ${role.name} AI candidates. Compare models by success rate, cost, and speed across 5 languages.`
+	const ogDescription = `${role.name} — AI Engineer Candidates`
+	const path = `/evals/workers/${roleId}`
+
+	return {
+		title,
+		description,
+		alternates: {
+			canonical: `${SEO.url}${path}`,
+		},
+		openGraph: {
+			title,
+			description,
+			url: `${SEO.url}${path}`,
+			siteName: SEO.name,
+			images: [
+				{
+					url: ogImageUrl(title, ogDescription),
+					width: 1200,
+					height: 630,
+					alt: title,
+				},
+			],
+			locale: SEO.locale,
+			type: "website",
+		},
+		twitter: {
+			card: SEO.twitterCard,
+			title,
+			description,
+			images: [ogImageUrl(title, ogDescription)],
+		},
+		keywords: [
+			...SEO.keywords,
+			"AI engineer",
+			"model recommendations",
+			"coding evals",
+			role.name.toLowerCase(),
+			"model comparison",
+		],
+	}
+}
+
+// ── Page Component ──────────────────────────────────────────────────────────
+
+/**
+ * Server page for `/evals/workers/[roleId]`: loads the role's recommendation
+ * (calling `notFound()` if absent) and pre-computes each candidate's cloud URL.
+ */
+export default async function RoleCandidatesPage({ params }: PageProps) {
+	const { roleId } = await params
+	const recommendation = getRoleRecommendation(roleId)
+
+	if (!recommendation) {
+		notFound()
+	}
+
+	const { role, best, budgetHire, speedHire, allCandidates, totalEvalRuns, totalExercises, lastUpdated } =
+		recommendation
+
+	// Pre-compute cloud URLs on the server so the client component receives
+	// only serializable data (no functions).
+	const cloudUrls: Record<string, string> = {}
+	for (const candidate of allCandidates) {
+		cloudUrls[candidate.modelId] = getCloudSetupUrl(candidate)
+	}
+
+	return (
+
+	)
+}
diff --git a/apps/web-roo-code/src/app/evals/workers/page.tsx b/apps/web-roo-code/src/app/evals/workers/page.tsx
new file mode 100644
index 00000000000..d7a8f5cc17a
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/workers/page.tsx
@@ -0,0 +1,89 @@
+import type { Metadata } from "next"
+
+import { SEO } from "@/lib/seo"
+import { ogImageUrl } from "@/lib/og"
+import { getEngineerRoles, getAllRecommendations } from "@/lib/mock-recommendations"
+
+import { WorkersContent } from "./workers-content"
+
+// ── SEO Metadata ────────────────────────────────────────────────────────────
+
+const TITLE = "Hire an AI Engineer | Roo Code Evals"
+const DESCRIPTION =
+	"Find the right AI coding model for your team. Compare interview results across Junior, Senior, and Staff Engineer roles."
+const OG_DESCRIPTION = "Find the right AI coding model for your team"
+const PATH = "/evals/workers"
+
+export const metadata: Metadata = {
+	title: TITLE,
+	description: DESCRIPTION,
+	alternates: {
+		canonical: `${SEO.url}${PATH}`,
+	},
+	openGraph: {
+		title: TITLE,
+		description: DESCRIPTION,
+		url: `${SEO.url}${PATH}`,
+		siteName: SEO.name,
+		images: [
+			{
+				url: ogImageUrl(TITLE, OG_DESCRIPTION),
+				width: 1200,
+				height: 630,
+				alt: TITLE,
+			},
+		],
+		locale: SEO.locale,
+		type: "website",
+	},
+	twitter: {
+		card: SEO.twitterCard,
+		title: TITLE,
+		description: DESCRIPTION,
+		images: [ogImageUrl(TITLE, OG_DESCRIPTION)],
+	},
+	keywords: [
+		...SEO.keywords,
+		"AI engineer",
+		"model recommendations",
+		"coding evals",
+		"model comparison",
+		"hire AI",
+		"talent marketplace",
+	],
+}
+
+// ── Page Component ──────────────────────────────────────────────────────────
+
+/**
+ * Server page for `/evals/workers`: aggregates eval-run and exercise totals,
+ * the distinct model count, and the lexicographically last `lastUpdated` value.
+ */
+export default function HireAnAIEngineerPage() {
+	const roles = getEngineerRoles()
+	const recommendations = getAllRecommendations()
+
+	// Aggregate totals
+	const totalEvalRuns = recommendations.reduce((sum, r) => sum + r.totalEvalRuns, 0)
+	const totalExercises = recommendations.reduce((sum, r) => sum + r.totalExercises, 0)
+
+	// Unique model count across all roles
+	const uniqueModels = new Set(recommendations.flatMap((r) => r.allCandidates.map((c) => c.modelId)))
+	const totalModels = uniqueModels.size
+
+	const lastUpdated = recommendations
+		.map((r) => r.lastUpdated)
+		.sort()
+		.pop()
+
+	return (
+
+	)
+}
diff --git a/apps/web-roo-code/src/app/evals/workers/workers-content.tsx b/apps/web-roo-code/src/app/evals/workers/workers-content.tsx
new file mode 100644
index 00000000000..ee0618eb738
--- /dev/null
+++ b/apps/web-roo-code/src/app/evals/workers/workers-content.tsx
@@ -0,0 +1,522 @@
+"use client"
+
+import { motion } from "framer-motion"
+import {
+	Code,
+	GitBranch,
+	Building2,
+	Search,
+	Bot,
+	ArrowRight,
+	ChevronDown,
+	CheckCircle2,
+	AlertTriangle,
+	Users,
+	FlaskConical,
+	Beaker,
+	Globe,
+	TrendingUp,
+} from "lucide-react"
+import type { LucideIcon } from "lucide-react"
+import Link from "next/link"
+
+import type { EngineerRole, RoleRecommendation } from "@/lib/mock-recommendations"
+import { TASKS_PER_DAY } from "@/lib/mock-recommendations"
+
+// ── Icon Mapping ────────────────────────────────────────────────────────────
+
+/** Icon components addressable by the name stored in a role's `icon` field. */
+const ICON_MAP: Record<string, LucideIcon> = {
+	Code,
+	GitBranch,
+	Building2,
+	Search,
+	Bot,
+}
+
+// ── Color Themes per Role ───────────────────────────────────────────────────
+
+/** Per-role colour theme: an accent name plus class-name strings for each themed element. */
+type RoleTheme = {
+	accent: string
+	accentLight: string
+	accentDark: string
+	iconBg: string
+	iconText: string
+	badgeBg: string
+	badgeText: string
+	borderHover: string
+	shadowHover: string
+	buttonBg: string
+	buttonHover: string
+	glowColor: string
+	dotColor: string
+	strengthColor: string
+}
+
+/** Themes keyed by role id. */
+const ROLE_THEMES: Record<string, RoleTheme> = {
+	junior: {
+		accent: "emerald",
+		accentLight: "text-emerald-600",
+		accentDark: "dark:text-emerald-400",
+		iconBg: "bg-emerald-100 dark:bg-emerald-900/30",
+		iconText: 
"text-emerald-700 dark:text-emerald-300", + badgeBg: "bg-emerald-100 dark:bg-emerald-900/30", + badgeText: "text-emerald-700 dark:text-emerald-300", + borderHover: "hover:border-emerald-500/40 dark:hover:border-emerald-400/30", + shadowHover: "hover:shadow-emerald-500/10 dark:hover:shadow-emerald-400/10", + buttonBg: "bg-emerald-600 dark:bg-emerald-600", + buttonHover: "hover:bg-emerald-700 dark:hover:bg-emerald-500", + glowColor: "bg-emerald-500/8 dark:bg-emerald-600/15", + dotColor: "bg-emerald-500", + strengthColor: "text-emerald-600 dark:text-emerald-400", + }, + senior: { + accent: "blue", + accentLight: "text-blue-600", + accentDark: "dark:text-blue-400", + iconBg: "bg-blue-100 dark:bg-blue-900/30", + iconText: "text-blue-700 dark:text-blue-300", + badgeBg: "bg-blue-100 dark:bg-blue-900/30", + badgeText: "text-blue-700 dark:text-blue-300", + borderHover: "hover:border-blue-500/40 dark:hover:border-blue-400/30", + shadowHover: "hover:shadow-blue-500/10 dark:hover:shadow-blue-400/10", + buttonBg: "bg-blue-600 dark:bg-blue-600", + buttonHover: "hover:bg-blue-700 dark:hover:bg-blue-500", + glowColor: "bg-blue-500/8 dark:bg-blue-600/15", + dotColor: "bg-blue-500", + strengthColor: "text-blue-600 dark:text-blue-400", + }, + staff: { + accent: "amber", + accentLight: "text-amber-600", + accentDark: "dark:text-amber-400", + iconBg: "bg-amber-100 dark:bg-amber-900/30", + iconText: "text-amber-700 dark:text-amber-300", + badgeBg: "bg-amber-100 dark:bg-amber-900/30", + badgeText: "text-amber-700 dark:text-amber-300", + borderHover: "hover:border-amber-500/40 dark:hover:border-amber-400/30", + shadowHover: "hover:shadow-amber-500/10 dark:hover:shadow-amber-400/10", + buttonBg: "bg-amber-600 dark:bg-amber-600", + buttonHover: "hover:bg-amber-700 dark:hover:bg-amber-500", + glowColor: "bg-amber-500/8 dark:bg-amber-600/15", + dotColor: "bg-amber-500", + strengthColor: "text-amber-600 dark:text-amber-400", + }, + reviewer: { + accent: "violet", + accentLight: 
"text-violet-600", + accentDark: "dark:text-violet-400", + iconBg: "bg-violet-100 dark:bg-violet-900/30", + iconText: "text-violet-700 dark:text-violet-300", + badgeBg: "bg-violet-100 dark:bg-violet-900/30", + badgeText: "text-violet-700 dark:text-violet-300", + borderHover: "hover:border-violet-500/40 dark:hover:border-violet-400/30", + shadowHover: "hover:shadow-violet-500/10 dark:hover:shadow-violet-400/10", + buttonBg: "bg-violet-600 dark:bg-violet-600", + buttonHover: "hover:bg-violet-700 dark:hover:bg-violet-500", + glowColor: "bg-violet-500/8 dark:bg-violet-600/15", + dotColor: "bg-violet-500", + strengthColor: "text-violet-600 dark:text-violet-400", + }, + autonomous: { + accent: "cyan", + accentLight: "text-cyan-600", + accentDark: "dark:text-cyan-400", + iconBg: "bg-cyan-100 dark:bg-cyan-900/30", + iconText: "text-cyan-700 dark:text-cyan-300", + badgeBg: "bg-cyan-100 dark:bg-cyan-900/30", + badgeText: "text-cyan-700 dark:text-cyan-300", + borderHover: "hover:border-cyan-500/40 dark:hover:border-cyan-400/30", + shadowHover: "hover:shadow-cyan-500/10 dark:hover:shadow-cyan-400/10", + buttonBg: "bg-cyan-600 dark:bg-cyan-600", + buttonHover: "hover:bg-cyan-700 dark:hover:bg-cyan-500", + glowColor: "bg-cyan-500/8 dark:bg-cyan-600/15", + dotColor: "bg-cyan-500", + strengthColor: "text-cyan-600 dark:text-cyan-400", + }, +} + +const DEFAULT_THEME = ROLE_THEMES.senior! 
+ +// ── Framer Motion Variants ────────────────────────────────────────────────── + +const containerVariants = { + hidden: { opacity: 0 }, + visible: { + opacity: 1, + transition: { + staggerChildren: 0.15, + delayChildren: 0.2, + }, + }, +} + +const cardVariants = { + hidden: { opacity: 0, y: 30 }, + visible: { + opacity: 1, + y: 0, + transition: { + duration: 0.6, + ease: [0.21, 0.45, 0.27, 0.9] as const, + }, + }, +} + +const fadeUpVariants = { + hidden: { opacity: 0, y: 20 }, + visible: { + opacity: 1, + y: 0, + transition: { + duration: 0.6, + ease: [0.21, 0.45, 0.27, 0.9] as const, + }, + }, +} + +const backgroundVariants = { + hidden: { opacity: 0 }, + visible: { + opacity: 1, + transition: { + duration: 1.2, + ease: "easeOut" as const, + }, + }, +} + +// ── Sub-Components ────────────────────────────────────────────────────────── + +function StatPill({ icon: Icon, value, label }: { icon: LucideIcon; value: string; label: string }) { + return ( +
+ + {value} + {label} +
+ ) +} + +// ── Main Content Component ────────────────────────────────────────────────── + +type WorkersContentProps = { + roles: EngineerRole[] + recommendations: RoleRecommendation[] + totalEvalRuns: number + totalExercises: number + totalModels: number + lastUpdated: string | undefined +} + +export function WorkersContent({ + roles, + recommendations, + totalEvalRuns, + totalExercises, + totalModels, + lastUpdated, +}: WorkersContentProps) { + const recByRole = new Map(recommendations.map((r) => [r.roleId, r])) + + return ( + <> + {/* ── Hero Section ───────────────────────────────────────────── */} +
+ {/* Atmospheric blur background */} + +
+
+
+
+
+ + + {/* Gradient fade from hero atmosphere to cards */} +
+ +
+ + {/* Badge */} + + + + How we interview AI models + + + + + {/* Heading */} + + Hire an{" "} + + AI Engineer + + + + {/* Subheading */} + + Every model runs the same coding tasks, same tools, same time limit. Pick the right + candidate for your team and budget. + + + {/* Stats bar */} + + +
+ +
+ +
+ + + +
+
+ + {/* ── Role Cards Grid ────────────────────────────────────────── */} +
+ {/* Subtle section background */} + +
+
+
+ + +
+ {/* Section connector */} + +

+ Choose your agentic team member +

+ +
+ + + {roles.map((role) => { + const rec = recByRole.get(role.id) + const IconComponent = ICON_MAP[role.icon] ?? Code + const candidateCount = rec?.allCandidates.length ?? 0 + const exerciseCount = rec?.totalExercises ?? 0 + const theme = ROLE_THEMES[role.id] ?? DEFAULT_THEME + const topModel = rec?.best[0] + + return ( + +
+ {/* Subtle glow on hover */} +
+ +
+ {/* Header: Icon + role badge */} +
+
+ +
+ {topModel && ( + + Top: {topModel.displayName} + + )} +
+ + {/* Role name + salary */} +

{role.name}

+

+ {role.salaryRange} +

+ + {/* Description */} +

+ {role.description} +

+ + {/* Best for */} +
+

+ Best for +

+
+ {role.bestFor.map((item) => ( + + {item} + + ))} +
+
+ + {/* Strengths & Weaknesses side by side */} +
+ {/* Strengths */} +
+

+ Strengths +

+
    + {role.strengths.map((item) => ( +
  • + + {item} +
  • + ))} +
+
+ + {/* Weaknesses */} +
+

+ Trade-offs +

+
    + {role.weaknesses.map((item) => ( +
  • + + {item} +
  • + ))} +
+
+
+ + {/* Bottom stats + CTA */} +
+
+ + + {candidateCount} candidates + + + + {exerciseCount.toLocaleString()} exercises + +
+ + + View Candidates + + +
+
+
+ + ) + })} + +
+
+ + {/* ── Footer / Methodology Section ───────────────────────────── */} +
+
+ + {/* Stats summary */} + + + {totalEvalRuns.toLocaleString()}+ + {" "} + eval runs + + 5 languages + + Last updated:{" "} + + {lastUpdated + ? new Date(lastUpdated).toLocaleDateString("en-US", { + year: "numeric", + month: "long", + day: "numeric", + }) + : "N/A"} + + + + {/* Assumption note */} + + Daily costs assume ~{TASKS_PER_DAY} tasks per agent per day (~6 productive hours including + overhead). + + + {/* Links */} + + + + Our methodology + + + + Raw eval data + + + + +
+
+ + ) +} diff --git a/apps/web-roo-code/src/lib/mock-recommendations.ts b/apps/web-roo-code/src/lib/mock-recommendations.ts new file mode 100644 index 00000000000..214d12bd27c --- /dev/null +++ b/apps/web-roo-code/src/lib/mock-recommendations.ts @@ -0,0 +1,866 @@ +// --------------------------------------------------------------------------- +// Eval Recommendations: Types + Mock Data (S1.1a) +// --------------------------------------------------------------------------- +// This file defines the API contract for the AI Engineer Talent Marketplace. +// The backend (Sprint 3-4) will produce data matching these exact types. +// --------------------------------------------------------------------------- + +// ── Constants ────────────────────────────────────────────────────────────── + +/** + * Estimated tasks per agent per day. + * Assumes ~6 productive hours with overhead for setup, review, and iteration. + * One human engineer typically manages 2-3 agents throughout a workday. + */ +export const TASKS_PER_DAY = 80 + +// ── Types ────────────────────────────────────────────────────────────────── + +/** Engineer role definition: maps task complexity to a hiring tier. */ +export type EngineerRole = { + id: string + name: string + /** Daily salary range string, e.g. "~$3–38/day" */ + salaryRange: string + description: string + bestFor: string[] + strengths: string[] + weaknesses: string[] + icon: string +} + +/** Language-specific eval scores (0–100). */ +export type LanguageScores = { + go: number + java: number + javascript: number + python: number + rust: number +} + +/** Model inference settings used during evaluation. */ +export type ModelSettings = { + temperature: number + reasoningEffort?: string +} + +/** A model candidate evaluated for a specific role. 
*/ +export type ModelCandidate = { + provider: string + modelId: string + displayName: string + compositeScore: number + tier: "best" | "recommended" | "situational" | "not-recommended" + tags: string[] + successRate: number + avgCostPerTask: number + /** Estimated daily cost: avgCostPerTask × TASKS_PER_DAY */ + estimatedDailyCost: number + avgTimePerTask: number + languageScores: LanguageScores + settings: ModelSettings + caveats?: string[] +} + +/** Full recommendation payload for a single role. */ +export type RoleRecommendation = { + roleId: string + role: EngineerRole + lastUpdated: string + totalEvalRuns: number + totalExercises: number + best: ModelCandidate[] + budgetHire: ModelCandidate | null + speedHire: ModelCandidate | null + allCandidates: ModelCandidate[] +} + +// ── Engineer Role Configs ────────────────────────────────────────────────── + +const ENGINEER_ROLES: EngineerRole[] = [ + { + id: "junior", + name: "Junior Engineer", + salaryRange: "~$2–10/day", + description: + "Handles well-scoped, single-file tasks: boilerplate, simple bug fixes, and test generation at the lowest cost per task.", + bestFor: ["Single-file fixes", "Boilerplate generation", "Test generation", "Simple implementations"], + strengths: ["Cheap", "High throughput", "Best cost-to-quality ratio on simple tasks"], + weaknesses: ["Struggles with multi-file changes", "Limited reasoning depth", "May miss edge cases"], + icon: "Code", + }, + { + id: "senior", + name: "Senior Engineer", + salaryRange: "~$10–26/day", + description: + "The sweet spot for most engineering work. 
Senior-tier models balance cost and quality across multi-file refactors, feature development, and debugging.", + bestFor: ["Multi-file refactors", "Feature development", "Debugging", "Code review"], + strengths: [ + "Balanced cost/quality", + "Handles multi-file changes and cross-cutting refactors", + "Consistent pass rates across all five languages", + ], + weaknesses: ["More expensive than junior", "Overkill for trivial tasks"], + icon: "GitBranch", + }, + { + id: "staff", + name: "Staff Engineer", + salaryRange: "~$8–34/day", + description: + "For architecture decisions, system design, and complex refactors. Staff-tier models handle ambiguous requirements and cross-cutting changes where other tiers fail.", + bestFor: ["Architecture decisions", "Complex features", "System design", "Ambiguous requirements"], + strengths: [ + "Handles multi-step reasoning and ambiguous specs", + "Passes existing test suites consistently", + "Resolves underspecified requirements", + ], + weaknesses: ["Most expensive", "Overkill for simple tasks", "Diminishing returns on easy work"], + icon: "Building2", + }, + { + id: "reviewer", + name: "Architecture Reviewer", + salaryRange: "~$15–40/day", + description: + "For code review, PR feedback, security analysis, and design critique. 
Reviewer-tier models catch issues other models miss and provide actionable, context-aware suggestions.", + bestFor: ["Code review", "PR feedback", "Security analysis", "Design critique", "Refactor guidance"], + strengths: [ + "Catches subtle bugs and logic errors", + "Provides actionable suggestions with context", + "Understands cross-file impact of changes", + ], + weaknesses: [ + "Not for writing code from scratch", + "More expensive than running linters", + "Review quality varies by codebase size", + ], + icon: "Search", + }, + { + id: "autonomous", + name: "Autonomous Agent", + salaryRange: "~$5–30/day", + description: + "For issue-to-PR workflows, long-running tasks, and multi-step debugging with minimal supervision. Autonomous-tier models complete tasks end-to-end and recover from errors without human intervention.", + bestFor: [ + "Issue-to-PR workflows", + "Multi-step debugging", + "Feature implementation from spec", + "Long-running tasks", + "Batch operations", + ], + strengths: [ + "Completes tasks end-to-end with minimal guidance", + "Recovers from errors and retries automatically", + "Handles ambiguous requirements independently", + ], + weaknesses: [ + "Higher cost per completed task due to retries", + "May take unexpected approaches without oversight", + "Results need review before merging", + ], + icon: "Bot", + }, +] + +// ── Model Candidates (derived from roocode.com/evals data) ───────────────── +// Cost per task = total run cost ÷ 120 exercises +// Time per task = total duration (seconds) ÷ 120 exercises +// Composite scores computed using role-specific weights: +// Junior: success 50%, speed 20%, cost 25%, quality 5% +// Senior: success 40%, quality 25%, cost 20%, speed 15% +// Staff: success 40%, quality 35%, cost 15%, speed 10% +// Quality = consistency across languages (lower variance → higher score) + +// --- Junior Role Candidates ------------------------------------------------- + +const juniorCandidates: ModelCandidate[] = [ + { + 
provider: "xai",
+		modelId: "grok-4-fast",
+		displayName: "Grok 4 Fast",
+		compositeScore: 94,
+		tier: "best",
+		tags: ["best-value"],
+		successRate: 97,
+		avgCostPerTask: 0.029,
+		estimatedDailyCost: 0.029 * TASKS_PER_DAY,
+		avgTimePerTask: 144.0,
+		languageScores: { go: 97, java: 96, javascript: 98, python: 100, rust: 97 },
+		settings: { temperature: 0 },
+	},
+	{
+		provider: "openai",
+		modelId: "gpt-5-mini",
+		displayName: "GPT-5 Mini",
+		compositeScore: 92,
+		tier: "best",
+		tags: [],
+		successRate: 99,
+		avgCostPerTask: 0.028,
+		estimatedDailyCost: 0.028 * TASKS_PER_DAY,
+		avgTimePerTask: 173.0,
+		languageScores: { go: 100, java: 98, javascript: 100, python: 100, rust: 97 },
+		settings: { temperature: 0 },
+	},
+	{
+		provider: "xai",
+		modelId: "grok-code-fast-1",
+		displayName: "Grok Code Fast 1",
+		compositeScore: 85,
+		tier: "best",
+		tags: [],
+		successRate: 90,
+		avgCostPerTask: 0.057,
+		estimatedDailyCost: 0.057 * TASKS_PER_DAY,
+		avgTimePerTask: 146.0,
+		languageScores: { go: 92, java: 91, javascript: 88, python: 94, rust: 83 },
+		settings: { temperature: 0 },
+		caveats: ["Weaker on Rust (83%): consider alternatives for Rust-heavy tasks"],
+	},
+	{
+		provider: "google",
+		modelId: "gemini-2.5-flash",
+		displayName: "Gemini 2.5 Flash",
+		compositeScore: 82,
+		tier: "recommended",
+		tags: ["speed-hire"],
+		successRate: 90,
+		avgCostPerTask: 0.118,
+		estimatedDailyCost: 0.118 * TASKS_PER_DAY,
+		avgTimePerTask: 109.5,
+		languageScores: { go: 89, java: 91, javascript: 92, python: 85, rust: 90 },
+		settings: { temperature: 0 },
+	},
+	{
+		provider: "openai",
+		modelId: "gpt-4.1-mini",
+		displayName: "GPT-4.1 Mini",
+		compositeScore: 77,
+		tier: "recommended",
+		tags: [],
+		successRate: 83,
+		avgCostPerTask: 0.073,
+		estimatedDailyCost: 0.073 * TASKS_PER_DAY,
+		avgTimePerTask: 158.5,
+		languageScores: { go: 81, java: 84, javascript: 94, python: 76, rust: 70 },
+		settings: { temperature: 0 },
+		caveats: ["Inconsistent across languages: Rust (70%) to JavaScript (94%)"], // range = min/max of languageScores above
+	},
+	{
+		provider: "anthropic",
+		modelId: "claude-haiku-4-5",
+		displayName: "Claude Haiku 4.5",
+		compositeScore: 77,
+		tier: "recommended",
+		tags: [],
+		successRate: 95,
+		avgCostPerTask: 0.159,
+		estimatedDailyCost: 0.159 * TASKS_PER_DAY,
+		avgTimePerTask: 139.0,
+		languageScores: { go: 92, java: 93, javascript: 94, python: 97, rust: 100 },
+		settings: { temperature: 0 },
+		caveats: ["Most expensive in junior tier. Consider Grok 4 Fast for better cost-to-quality ratio."],
+	},
+	{
+		provider: "openai",
+		modelId: "gpt-5-nano",
+		displayName: "GPT-5 Nano",
+		compositeScore: 73,
+		tier: "situational",
+		tags: ["budget-hire"],
+		successRate: 78,
+		avgCostPerTask: 0.013,
+		estimatedDailyCost: 0.013 * TASKS_PER_DAY,
+		avgTimePerTask: 276.5,
+		languageScores: { go: 86, java: 73, javascript: 76, python: 79, rust: 77 },
+		settings: { temperature: 0 },
+		caveats: ["Cheapest option but slowest: 4.6 min/task average"],
+	},
+	{
+		provider: "deepseek",
+		modelId: "deepseek-v3",
+		displayName: "DeepSeek V3",
+		compositeScore: 66,
+		tier: "situational",
+		tags: [],
+		successRate: 77,
+		avgCostPerTask: 0.107,
+		estimatedDailyCost: 0.107 * TASKS_PER_DAY,
+		avgTimePerTask: 216.0,
+		languageScores: { go: 83, java: 76, javascript: 82, python: 76, rust: 67 },
+		settings: { temperature: 0 },
+		caveats: ["Weakest on Rust (67%)", "Open-source model, self-hostable"],
+	},
+]
+
+// --- Senior Role Candidates -------------------------------------------------
+
+const seniorCandidates: ModelCandidate[] = [
+	{
+		provider: "moonshot",
+		modelId: "kimi-k2-0905",
+		displayName: "Kimi K2 0905",
+		compositeScore: 95,
+		tier: "best",
+		tags: ["budget-hire", "best-value"],
+		successRate: 94,
+		avgCostPerTask: 0.127,
+		estimatedDailyCost: 0.127 * TASKS_PER_DAY,
+		avgTimePerTask: 112.0,
+		languageScores: { go: 94, java: 91, javascript: 96, python: 97, rust: 93 },
+		settings: { temperature: 0 },
+		caveats: ["Tested via Groq; latency may vary by provider"],
+	},
+	{
+		
provider: "openai", + modelId: "gpt-4.1", + displayName: "GPT-4.1", + compositeScore: 87, + tier: "best", + tags: [], + successRate: 91, + avgCostPerTask: 0.322, + estimatedDailyCost: 0.322 * TASKS_PER_DAY, + avgTimePerTask: 139.5, + languageScores: { go: 92, java: 91, javascript: 90, python: 94, rust: 90 }, + settings: { temperature: 0 }, + }, + { + provider: "anthropic", + modelId: "claude-sonnet-4", + displayName: "Claude Sonnet 4", + compositeScore: 84, + tier: "best", + tags: ["top-performer"], + successRate: 98, + avgCostPerTask: 0.33, + estimatedDailyCost: 0.33 * TASKS_PER_DAY, + avgTimePerTask: 167.5, + languageScores: { go: 94, java: 100, javascript: 98, python: 100, rust: 97 }, + settings: { temperature: 0 }, + }, + { + provider: "openai", + modelId: "gpt-5-medium", + displayName: "GPT-5 (Medium)", + compositeScore: 81, + tier: "recommended", + tags: [], + successRate: 98, + avgCostPerTask: 0.193, + estimatedDailyCost: 0.193 * TASKS_PER_DAY, + avgTimePerTask: 260.0, + languageScores: { go: 97, java: 98, javascript: 100, python: 100, rust: 93 }, + settings: { temperature: 0, reasoningEffort: "medium" }, + caveats: ["Slowest in tier: 4.3 min/task average"], + }, + { + provider: "anthropic", + modelId: "claude-3.7-sonnet", + displayName: "Claude 3.7 Sonnet", + compositeScore: 79, + tier: "recommended", + tags: [], + successRate: 95, + avgCostPerTask: 0.313, + estimatedDailyCost: 0.313 * TASKS_PER_DAY, + avgTimePerTask: 176.5, + languageScores: { go: 92, java: 98, javascript: 94, python: 100, rust: 93 }, + settings: { temperature: 0 }, + }, + { + provider: "anthropic", + modelId: "claude-3.5-sonnet", + displayName: "Claude 3.5 Sonnet", + compositeScore: 78, + tier: "recommended", + tags: ["speed-hire"], + successRate: 90, + avgCostPerTask: 0.208, + estimatedDailyCost: 0.208 * TASKS_PER_DAY, + avgTimePerTask: 108.5, + languageScores: { go: 94, java: 91, javascript: 92, python: 88, rust: 80 }, + settings: { temperature: 0 }, + caveats: ["Previous generation; 
weaker on Rust (80%)"], + }, + { + provider: "openai", + modelId: "gpt-5-low", + displayName: "GPT-5 (Low)", + compositeScore: 76, + tier: "situational", + tags: [], + successRate: 95, + avgCostPerTask: 0.135, + estimatedDailyCost: 0.135 * TASKS_PER_DAY, + avgTimePerTask: 175.0, + languageScores: { go: 100, java: 96, javascript: 86, python: 100, rust: 100 }, + settings: { temperature: 0, reasoningEffort: "low" }, + caveats: ["Weak on JavaScript (86%) compared to other languages"], + }, + { + provider: "google", + modelId: "gemini-2.5-pro", + displayName: "Gemini 2.5 Pro", + compositeScore: 73, + tier: "situational", + tags: [], + successRate: 96, + avgCostPerTask: 0.482, + estimatedDailyCost: 0.482 * TASKS_PER_DAY, + avgTimePerTask: 188.5, + languageScores: { go: 97, java: 91, javascript: 96, python: 100, rust: 97 }, + settings: { temperature: 0 }, + caveats: ["Most expensive in this tier: ~$39/day ($0.48/task)"], + }, +] + +// --- Staff Role Candidates -------------------------------------------------- + +const staffCandidates: ModelCandidate[] = [ + { + provider: "openai", + modelId: "gpt-5.2-med", + displayName: "GPT 5.2 (Med)", + compositeScore: 99, + tier: "best", + tags: ["budget-hire", "best-value"], + successRate: 100, + avgCostPerTask: 0.104, + estimatedDailyCost: 0.104 * TASKS_PER_DAY, + avgTimePerTask: 105.5, + languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 }, + settings: { temperature: 0, reasoningEffort: "medium" }, + caveats: ["100% pass rate at ~$8/day ($0.10/task): best cost-to-quality ratio in this role"], + }, + { + provider: "anthropic", + modelId: "claude-opus-4-6", + displayName: "Claude Opus 4.6", + compositeScore: 98, + tier: "best", + tags: ["speed-hire", "top-performer"], + successRate: 100, + avgCostPerTask: 0.412, + estimatedDailyCost: 0.412 * TASKS_PER_DAY, + avgTimePerTask: 76.5, + languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 }, + settings: { temperature: 0 }, + }, + { + 
provider: "anthropic", + modelId: "claude-sonnet-4-5", + displayName: "Claude Sonnet 4.5", + compositeScore: 97, + tier: "best", + tags: [], + successRate: 100, + avgCostPerTask: 0.32, + estimatedDailyCost: 0.32 * TASKS_PER_DAY, + avgTimePerTask: 103.0, + languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 }, + settings: { temperature: 0 }, + }, + { + provider: "anthropic", + modelId: "claude-opus-4-5", + displayName: "Claude Opus 4.5", + compositeScore: 96, + tier: "recommended", + tags: [], + successRate: 100, + avgCostPerTask: 0.419, + estimatedDailyCost: 0.419 * TASKS_PER_DAY, + avgTimePerTask: 124.0, + languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 }, + settings: { temperature: 0 }, + }, + { + provider: "google", + modelId: "gemini-3-pro-preview", + displayName: "Gemini 3 Pro Preview", + compositeScore: 95, + tier: "recommended", + tags: [], + successRate: 100, + avgCostPerTask: 0.276, + estimatedDailyCost: 0.276 * TASKS_PER_DAY, + avgTimePerTask: 164.0, + languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 }, + settings: { temperature: 0 }, + }, + { + provider: "anthropic", + modelId: "claude-opus-4-1", + displayName: "Claude Opus 4.1", + compositeScore: 73, + tier: "situational", + tags: [], + successRate: 98, + avgCostPerTask: 1.168, + estimatedDailyCost: 1.168 * TASKS_PER_DAY, + avgTimePerTask: 211.5, + languageScores: { go: 97, java: 96, javascript: 98, python: 100, rust: 100 }, + settings: { temperature: 0 }, + caveats: ["~$93/day ($1.17/task), 11× the cost of the top pick"], + }, + { + provider: "openai", + modelId: "gpt-5-medium", + displayName: "GPT-5 (Medium)", + compositeScore: 71, + tier: "situational", + tags: [], + successRate: 98, + avgCostPerTask: 0.193, + estimatedDailyCost: 0.193 * TASKS_PER_DAY, + avgTimePerTask: 260.0, + languageScores: { go: 97, java: 98, javascript: 100, python: 100, rust: 93 }, + settings: { temperature: 0, reasoningEffort: "medium" }, + 
caveats: ["Slowest in tier: 4.3 min/task average"], + }, + { + provider: "anthropic", + modelId: "claude-opus-4", + displayName: "Claude Opus 4", + compositeScore: 57, + tier: "not-recommended", + tags: [], + successRate: 94, + avgCostPerTask: 1.436, + estimatedDailyCost: 1.436 * TASKS_PER_DAY, + avgTimePerTask: 235.0, + languageScores: { go: 92, java: 91, javascript: 94, python: 94, rust: 100 }, + settings: { temperature: 0 }, + caveats: [ + "Most expensive model tested: ~$115/day ($1.44/task)", + "Lower success rate (94%) despite highest cost", + ], + }, +] + +// --- Architecture Reviewer Candidates --------------------------------------- +// Composite scoring: quality 50%, success 30%, cost 15%, speed 5% +// Quality = consistency across languages (lower variance → higher score) + +const reviewerCandidates: ModelCandidate[] = [ + { + provider: "openai", + modelId: "gpt-5.2-med", + displayName: "GPT 5.2 (Med)", + compositeScore: 98, + tier: "best", + tags: ["budget-hire", "best-value"], + successRate: 100, + avgCostPerTask: 0.104, + estimatedDailyCost: 0.104 * TASKS_PER_DAY, + avgTimePerTask: 105.5, + languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 }, + settings: { temperature: 0, reasoningEffort: "medium" }, + caveats: ["100% consistency across all languages: ideal reviewer at ~$8/day"], + }, + { + provider: "anthropic", + modelId: "claude-opus-4-6", + displayName: "Claude Opus 4.6", + compositeScore: 95, + tier: "best", + tags: ["speed-hire", "top-performer"], + successRate: 100, + avgCostPerTask: 0.412, + estimatedDailyCost: 0.412 * TASKS_PER_DAY, + avgTimePerTask: 76.5, + languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 }, + settings: { temperature: 0 }, + }, + { + provider: "anthropic", + modelId: "claude-sonnet-4-5", + displayName: "Claude Sonnet 4.5", + compositeScore: 94, + tier: "best", + tags: [], + successRate: 100, + avgCostPerTask: 0.32, + estimatedDailyCost: 0.32 * TASKS_PER_DAY, + 
avgTimePerTask: 103.0, + languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 }, + settings: { temperature: 0 }, + }, + { + provider: "anthropic", + modelId: "claude-sonnet-4", + displayName: "Claude Sonnet 4", + compositeScore: 90, + tier: "recommended", + tags: ["top-performer"], + successRate: 98, + avgCostPerTask: 0.33, + estimatedDailyCost: 0.33 * TASKS_PER_DAY, + avgTimePerTask: 167.5, + languageScores: { go: 94, java: 100, javascript: 98, python: 100, rust: 97 }, + settings: { temperature: 0 }, + }, + { + provider: "anthropic", + modelId: "claude-haiku-4-5", + displayName: "Claude Haiku 4.5", + compositeScore: 88, + tier: "recommended", + tags: [], + successRate: 95, + avgCostPerTask: 0.159, + estimatedDailyCost: 0.159 * TASKS_PER_DAY, + avgTimePerTask: 139.0, + languageScores: { go: 92, java: 93, javascript: 94, python: 97, rust: 100 }, + settings: { temperature: 0 }, + caveats: ["Budget reviewer option: good consistency at lower cost"], + }, + { + provider: "anthropic", + modelId: "claude-3.7-sonnet", + displayName: "Claude 3.7 Sonnet", + compositeScore: 86, + tier: "recommended", + tags: [], + successRate: 95, + avgCostPerTask: 0.313, + estimatedDailyCost: 0.313 * TASKS_PER_DAY, + avgTimePerTask: 176.5, + languageScores: { go: 92, java: 98, javascript: 94, python: 100, rust: 93 }, + settings: { temperature: 0 }, + }, + { + provider: "google", + modelId: "gemini-2.5-pro", + displayName: "Gemini 2.5 Pro", + compositeScore: 82, + tier: "situational", + tags: [], + successRate: 96, + avgCostPerTask: 0.482, + estimatedDailyCost: 0.482 * TASKS_PER_DAY, + avgTimePerTask: 188.5, + languageScores: { go: 97, java: 91, javascript: 96, python: 100, rust: 97 }, + settings: { temperature: 0 }, + caveats: ["Most expensive reviewer: ~$39/day ($0.48/task)", "More variable across languages than top picks"], + }, + { + provider: "openai", + modelId: "gpt-4.1", + displayName: "GPT-4.1", + compositeScore: 80, + tier: "situational", + tags: [], + 
successRate: 91, + avgCostPerTask: 0.322, + estimatedDailyCost: 0.322 * TASKS_PER_DAY, + avgTimePerTask: 139.5, + languageScores: { go: 92, java: 91, javascript: 90, python: 94, rust: 90 }, + settings: { temperature: 0 }, + caveats: ["Lower consistency across languages than Anthropic alternatives"], + }, +] + +// --- Autonomous Agent Candidates -------------------------------------------- +// Composite scoring: success 35%, quality 35%, cost 20%, speed 10% +// Focused on end-to-end task completion and error recovery + +const autonomousCandidates: ModelCandidate[] = [ + { + provider: "openai", + modelId: "gpt-5.2-med", + displayName: "GPT 5.2 (Med)", + compositeScore: 97, + tier: "best", + tags: ["best-value", "speed-hire"], + successRate: 100, + avgCostPerTask: 0.104, + estimatedDailyCost: 0.104 * TASKS_PER_DAY, + avgTimePerTask: 105.5, + languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 }, + settings: { temperature: 0, reasoningEffort: "medium" }, + caveats: ["Perfect success rate + fast completion: ideal autonomous agent at ~$8/day"], + }, + { + provider: "openai", + modelId: "gpt-5-mini", + displayName: "GPT-5 Mini", + compositeScore: 93, + tier: "best", + tags: ["budget-hire"], + successRate: 99, + avgCostPerTask: 0.028, + estimatedDailyCost: 0.028 * TASKS_PER_DAY, + avgTimePerTask: 173.0, + languageScores: { go: 100, java: 98, javascript: 100, python: 100, rust: 97 }, + settings: { temperature: 0 }, + caveats: ["Cheapest autonomous option at ~$2/day with near-perfect success"], + }, + { + provider: "xai", + modelId: "grok-4-fast", + displayName: "Grok 4 Fast", + compositeScore: 92, + tier: "best", + tags: [], + successRate: 97, + avgCostPerTask: 0.029, + estimatedDailyCost: 0.029 * TASKS_PER_DAY, + avgTimePerTask: 144.0, + languageScores: { go: 97, java: 96, javascript: 98, python: 100, rust: 97 }, + settings: { temperature: 0 }, + }, + { + provider: "anthropic", + modelId: "claude-sonnet-4", + displayName: "Claude Sonnet 4", + 
compositeScore: 87, + tier: "recommended", + tags: ["top-performer"], + successRate: 98, + avgCostPerTask: 0.33, + estimatedDailyCost: 0.33 * TASKS_PER_DAY, + avgTimePerTask: 167.5, + languageScores: { go: 94, java: 100, javascript: 98, python: 100, rust: 97 }, + settings: { temperature: 0 }, + }, + { + provider: "moonshot", + modelId: "kimi-k2-0905", + displayName: "Kimi K2 0905", + compositeScore: 86, + tier: "recommended", + tags: [], + successRate: 94, + avgCostPerTask: 0.127, + estimatedDailyCost: 0.127 * TASKS_PER_DAY, + avgTimePerTask: 112.0, + languageScores: { go: 94, java: 91, javascript: 96, python: 97, rust: 93 }, + settings: { temperature: 0 }, + caveats: ["Tested via Groq; latency may vary by provider"], + }, + { + provider: "anthropic", + modelId: "claude-haiku-4-5", + displayName: "Claude Haiku 4.5", + compositeScore: 85, + tier: "recommended", + tags: [], + successRate: 95, + avgCostPerTask: 0.159, + estimatedDailyCost: 0.159 * TASKS_PER_DAY, + avgTimePerTask: 139.0, + languageScores: { go: 92, java: 93, javascript: 94, python: 97, rust: 100 }, + settings: { temperature: 0 }, + }, + { + provider: "openai", + modelId: "gpt-5-low", + displayName: "GPT-5 (Low)", + compositeScore: 82, + tier: "situational", + tags: [], + successRate: 95, + avgCostPerTask: 0.135, + estimatedDailyCost: 0.135 * TASKS_PER_DAY, + avgTimePerTask: 175.0, + languageScores: { go: 100, java: 96, javascript: 86, python: 100, rust: 100 }, + settings: { temperature: 0, reasoningEffort: "low" }, + caveats: ["Weak on JavaScript (86%) compared to other languages"], + }, + { + provider: "openai", + modelId: "gpt-5-medium", + displayName: "GPT-5 (Medium)", + compositeScore: 80, + tier: "situational", + tags: [], + successRate: 98, + avgCostPerTask: 0.193, + estimatedDailyCost: 0.193 * TASKS_PER_DAY, + avgTimePerTask: 260.0, + languageScores: { go: 97, java: 98, javascript: 100, python: 100, rust: 93 }, + settings: { temperature: 0, reasoningEffort: "medium" }, + caveats: ["Slowest in 
tier: 4.3 min/task average"], + }, +] + +// ── Recommendation Builders ──────────────────────────────────────────────── + +function findBudgetHire(candidates: ModelCandidate[]): ModelCandidate | null { + const budget = candidates + .filter((c) => c.tags.includes("budget-hire")) + .sort((a, b) => a.avgCostPerTask - b.avgCostPerTask) + return budget[0] ?? null +} + +function findSpeedHire(candidates: ModelCandidate[]): ModelCandidate | null { + const fast = [...candidates] + .filter((c) => c.tier !== "not-recommended") + .sort((a, b) => a.avgTimePerTask - b.avgTimePerTask) + return fast[0] ?? null +} + +function buildRecommendation( + role: EngineerRole, + candidates: ModelCandidate[], + totalEvalRuns: number, + totalExercises: number, +): RoleRecommendation { + const sorted = [...candidates].sort((a, b) => b.compositeScore - a.compositeScore) + return { + roleId: role.id, + role, + lastUpdated: "2026-02-11T00:00:00Z", + totalEvalRuns, + totalExercises, + best: sorted.filter((c) => c.tier === "best").slice(0, 3), + budgetHire: findBudgetHire(sorted), + speedHire: findSpeedHire(sorted), + allCandidates: sorted, + } +} + +// ── Pre-built Recommendations ────────────────────────────────────────────── + +const RECOMMENDATIONS: Record<string, RoleRecommendation> = { + junior: buildRecommendation(ENGINEER_ROLES[0]!, juniorCandidates, 27, 120), + senior: buildRecommendation(ENGINEER_ROLES[1]!, seniorCandidates, 27, 120), + staff: buildRecommendation(ENGINEER_ROLES[2]!, staffCandidates, 27, 120), + reviewer: buildRecommendation(ENGINEER_ROLES[3]!, reviewerCandidates, 27, 120), + autonomous: buildRecommendation(ENGINEER_ROLES[4]!, autonomousCandidates, 27, 120), +} + +// ── Public API ───────────────────────────────────────────────────────────── + +/** Returns all engineer role configurations. */ +export function getEngineerRoles(): EngineerRole[] { + return ENGINEER_ROLES +} + +/** Returns a single engineer role by id, or `undefined` if not found.
*/ +export function getEngineerRole(roleId: string): EngineerRole | undefined { + return ENGINEER_ROLES.find((r) => r.id === roleId) +} + +/** Returns the full recommendation payload for a role, or `undefined` if not found. */ +export function getRoleRecommendation(roleId: string): RoleRecommendation | undefined { + return RECOMMENDATIONS[roleId] +} + +/** Returns recommendation payloads for all roles. */ +export function getAllRecommendations(): RoleRecommendation[] { + return Object.values(RECOMMENDATIONS) +} + +/** Generates a Cloud signup URL pre-configured with the candidate's model settings. */ +export function getCloudSetupUrl(candidate: ModelCandidate): string { + const params = new URLSearchParams({ + redirect_url: `/cloud-agents/setup?model=${candidate.modelId}&provider=${candidate.provider}&temperature=${candidate.settings.temperature}`, + }) + return `https://app.roocode.com/sign-up?${params.toString()}` +} From baf0d5c3b7387f53b3a071fec63fdde7fec412b0 Mon Sep 17 00:00:00 2001 From: Michael Preuss Date: Wed, 11 Feb 2026 19:39:45 -0800 Subject: [PATCH 04/22] feat(web-evals): redesign comparison page with glass-morphism and role themes - Atmospheric header with role-colored blur gradients - Glass-morphism containers for chart, filters, and export - Styled language toggle pills with role color accents - Themed provider checkboxes and success rate slider - Custom chart tooltip with backdrop blur - Export buttons with press feedback - framer-motion scroll-triggered animations - Bottom navigation with pill-style links - Role themes: reviewer (violet) and autonomous (cyan) added to candidates page --- .../[roleId]/compare/comparison-chart.tsx | 817 +++++++++++++----- 1 file changed, 587 insertions(+), 230 deletions(-) diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx index 4b87bbb1cf9..7029c900102 100644 --- 
a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx +++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx @@ -2,12 +2,188 @@ import { useState, useMemo, useCallback } from "react" import Link from "next/link" -import { ArrowLeft, Copy, Check, FileJson, FileSpreadsheet } from "lucide-react" +import { motion } from "framer-motion" +import { + ArrowLeft, + ArrowRight, + Copy, + Check, + FileJson, + FileSpreadsheet, + BarChart3, + SlidersHorizontal, + Download, + FlaskConical, +} from "lucide-react" import { BarChart, Bar, XAxis, YAxis, Tooltip, ResponsiveContainer, Legend } from "recharts" import type { ModelCandidate, LanguageScores, EngineerRole, RoleRecommendation } from "@/lib/mock-recommendations" import { TASKS_PER_DAY } from "@/lib/mock-recommendations" +// ── Role Color Themes (matching candidates-content.tsx) ───────────────────── + +type RoleTheme = { + accent: string + accentLight: string + accentDark: string + iconBg: string + iconText: string + buttonBg: string + buttonHover: string + glowColor: string + blurBg1: string + blurBg2: string + borderHover: string + shadowHover: string + methodologyBorder: string + scoreText: string + pillActive: string + pillActiveBg: string + checkboxAccent: string + sliderAccent: string +} + +const ROLE_THEMES: Record<string, RoleTheme> = { + junior: { + accent: "emerald", + accentLight: "text-emerald-600", + accentDark: "dark:text-emerald-400", + iconBg: "bg-emerald-100 dark:bg-emerald-900/30", + iconText: "text-emerald-700 dark:text-emerald-300", + buttonBg: "bg-emerald-600 dark:bg-emerald-600", + buttonHover: "hover:bg-emerald-700 dark:hover:bg-emerald-500", + glowColor: "bg-emerald-500/8 dark:bg-emerald-600/15", + blurBg1: "bg-emerald-500/10 dark:bg-emerald-600/20", + blurBg2: "bg-emerald-400/5 dark:bg-emerald-500/10", + borderHover: "hover:border-emerald-500/40 dark:hover:border-emerald-400/30", + shadowHover: "hover:shadow-emerald-500/10 dark:hover:shadow-emerald-400/10", + 
methodologyBorder: "border-emerald-500/30 hover:border-emerald-500/50", + scoreText: "text-emerald-400", + pillActive: "bg-emerald-600 text-white shadow-lg shadow-emerald-600/25", + pillActiveBg: "bg-emerald-600", + checkboxAccent: "accent-emerald-600", + sliderAccent: "accent-emerald-600", + }, + senior: { + accent: "blue", + accentLight: "text-blue-600", + accentDark: "dark:text-blue-400", + iconBg: "bg-blue-100 dark:bg-blue-900/30", + iconText: "text-blue-700 dark:text-blue-300", + buttonBg: "bg-blue-600 dark:bg-blue-600", + buttonHover: "hover:bg-blue-700 dark:hover:bg-blue-500", + glowColor: "bg-blue-500/8 dark:bg-blue-600/15", + blurBg1: "bg-blue-500/10 dark:bg-blue-600/20", + blurBg2: "bg-blue-400/5 dark:bg-blue-500/10", + borderHover: "hover:border-blue-500/40 dark:hover:border-blue-400/30", + shadowHover: "hover:shadow-blue-500/10 dark:hover:shadow-blue-400/10", + methodologyBorder: "border-blue-500/30 hover:border-blue-500/50", + scoreText: "text-blue-400", + pillActive: "bg-blue-600 text-white shadow-lg shadow-blue-600/25", + pillActiveBg: "bg-blue-600", + checkboxAccent: "accent-blue-600", + sliderAccent: "accent-blue-600", + }, + staff: { + accent: "amber", + accentLight: "text-amber-600", + accentDark: "dark:text-amber-400", + iconBg: "bg-amber-100 dark:bg-amber-900/30", + iconText: "text-amber-700 dark:text-amber-300", + buttonBg: "bg-amber-600 dark:bg-amber-600", + buttonHover: "hover:bg-amber-700 dark:hover:bg-amber-500", + glowColor: "bg-amber-500/8 dark:bg-amber-600/15", + blurBg1: "bg-amber-500/10 dark:bg-amber-600/20", + blurBg2: "bg-amber-400/5 dark:bg-amber-500/10", + borderHover: "hover:border-amber-500/40 dark:hover:border-amber-400/30", + shadowHover: "hover:shadow-amber-500/10 dark:hover:shadow-amber-400/10", + methodologyBorder: "border-amber-500/30 hover:border-amber-500/50", + scoreText: "text-amber-400", + pillActive: "bg-amber-600 text-white shadow-lg shadow-amber-600/25", + pillActiveBg: "bg-amber-600", + checkboxAccent: 
"accent-amber-600", + sliderAccent: "accent-amber-600", + }, + reviewer: { + accent: "violet", + accentLight: "text-violet-600", + accentDark: "dark:text-violet-400", + iconBg: "bg-violet-100 dark:bg-violet-900/30", + iconText: "text-violet-700 dark:text-violet-300", + buttonBg: "bg-violet-600 dark:bg-violet-600", + buttonHover: "hover:bg-violet-700 dark:hover:bg-violet-500", + glowColor: "bg-violet-500/8 dark:bg-violet-600/15", + blurBg1: "bg-violet-500/10 dark:bg-violet-600/20", + blurBg2: "bg-violet-400/5 dark:bg-violet-500/10", + borderHover: "hover:border-violet-500/40 dark:hover:border-violet-400/30", + shadowHover: "hover:shadow-violet-500/10 dark:hover:shadow-violet-400/10", + methodologyBorder: "border-violet-500/30 hover:border-violet-500/50", + scoreText: "text-violet-400", + pillActive: "bg-violet-600 text-white shadow-lg shadow-violet-600/25", + pillActiveBg: "bg-violet-600", + checkboxAccent: "accent-violet-600", + sliderAccent: "accent-violet-600", + }, + autonomous: { + accent: "cyan", + accentLight: "text-cyan-600", + accentDark: "dark:text-cyan-400", + iconBg: "bg-cyan-100 dark:bg-cyan-900/30", + iconText: "text-cyan-700 dark:text-cyan-300", + buttonBg: "bg-cyan-600 dark:bg-cyan-600", + buttonHover: "hover:bg-cyan-700 dark:hover:bg-cyan-500", + glowColor: "bg-cyan-500/8 dark:bg-cyan-600/15", + blurBg1: "bg-cyan-500/10 dark:bg-cyan-600/20", + blurBg2: "bg-cyan-400/5 dark:bg-cyan-500/10", + borderHover: "hover:border-cyan-500/40 dark:hover:border-cyan-400/30", + shadowHover: "hover:shadow-cyan-500/10 dark:hover:shadow-cyan-400/10", + methodologyBorder: "border-cyan-500/30 hover:border-cyan-500/50", + scoreText: "text-cyan-400", + pillActive: "bg-cyan-600 text-white shadow-lg shadow-cyan-600/25", + pillActiveBg: "bg-cyan-600", + checkboxAccent: "accent-cyan-600", + sliderAccent: "accent-cyan-600", + }, +} + +const DEFAULT_THEME = ROLE_THEMES.senior! 
+ +// ── Framer Motion Variants ────────────────────────────────────────────────── + +const containerVariants = { + hidden: { opacity: 0 }, + visible: { + opacity: 1, + transition: { + staggerChildren: 0.12, + delayChildren: 0.1, + }, + }, +} + +const fadeUpVariants = { + hidden: { opacity: 0, y: 20 }, + visible: { + opacity: 1, + y: 0, + transition: { + duration: 0.6, + ease: [0.21, 0.45, 0.27, 0.9] as const, + }, + }, +} + +const backgroundVariants = { + hidden: { opacity: 0 }, + visible: { + opacity: 1, + transition: { + duration: 1.2, + ease: "easeOut" as const, + }, + }, +} + // ── Constants ─────────────────────────────────────────────────────────────── const LANGUAGES: { key: keyof LanguageScores; label: string }[] = [ @@ -18,7 +194,17 @@ const LANGUAGES: { key: keyof LanguageScores; label: string }[] = [ { key: "rust", label: "Rust" }, ] -const PROVIDERS = ["anthropic", "openai", "google", "deepseek", "groq", "alibaba", "mistral"] as const +const PROVIDERS = [ + "anthropic", + "openai", + "google", + "deepseek", + "groq", + "alibaba", + "mistral", + "xai", + "moonshot", +] as const const PROVIDER_LABELS: Record = { anthropic: "Anthropic", @@ -28,6 +214,8 @@ const PROVIDER_LABELS: Record = { groq: "Meta/Groq", alibaba: "Alibaba", mistral: "Mistral", + xai: "xAI", + moonshot: "Moonshot", } const DIMENSION_COLORS = { @@ -124,29 +312,34 @@ function CustomTooltip({ const costPerTask = rawData?.costPerTask return ( -
-

{label}

- {payload.map( - ( - entry: { - name: string - value: number - color: string - dataKey: string - }, - index: number, - ) => ( -
- - {entry.name}: - - {entry.dataKey === "costEfficiency" && dailyCost !== undefined - ? `${entry.value} (~$${dailyCost}/day · $${costPerTask?.toFixed(3)}/task)` - : entry.value} - -
- ), - )} +
+

{label}

+
+ {payload.map( + ( + entry: { + name: string + value: number + color: string + dataKey: string + }, + index: number, + ) => ( +
+ + {entry.name}: + + {entry.dataKey === "costEfficiency" && dailyCost !== undefined + ? `${entry.value} (~$${dailyCost}/day · $${costPerTask?.toFixed(3)}/task)` + : entry.value} + +
+ ), + )} +
) } @@ -161,6 +354,7 @@ interface ComparisonChartProps { export function ComparisonChart({ recommendation, role, roleId }: ComparisonChartProps) { const { allCandidates } = recommendation + const theme = ROLE_THEMES[roleId] ?? DEFAULT_THEME // ── State ─────────────────────────────────────────────────────────────── const [selectedLanguage, setSelectedLanguage] = useState("all") @@ -186,6 +380,12 @@ export function ComparisonChart({ recommendation, role, roleId }: ComparisonChar const chartHeight = Math.max(300, chartData.length * 60 + 80) + // Providers that actually appear in data + const activeProviders = useMemo(() => { + const providers = new Set(allCandidates.map((c) => c.provider)) + return PROVIDERS.filter((p) => providers.has(p)) + }, [allCandidates]) + // ── Handlers ──────────────────────────────────────────────────────────── const toggleProvider = useCallback((provider: string) => { @@ -229,215 +429,372 @@ export function ComparisonChart({ recommendation, role, roleId }: ComparisonChar // ── Render ────────────────────────────────────────────────────────────── return ( -
- {/* ── Breadcrumb ─────────────────────────────────────────────── */} - - - {/* ── Page Header ────────────────────────────────────────────── */} -
-

Compare Candidates — {role.name}

-

- Interactive comparison across composite score, success rate, cost efficiency, and speed. -

-
- - {/* ── Language Toggle ─────────────────────────────────────────── */} -
-

- Score View -

-
- - {LANGUAGES.map(({ key, label }) => ( - - ))} -
-
- - {/* ── Filters ────────────────────────────────────────────────── */} -
- {/* Provider checkboxes */} -
-

- Providers -

-
- {PROVIDERS.map((p) => ( - - ))} -
-
- - {/* Min success rate slider */} -
-

- Min Success Rate -

-
- setMinSuccessRate(Number(e.target.value))} - className="h-2 flex-1 cursor-pointer accent-blue-600" + <> + {/* ── Atmospheric Header ────────────────────────────────────── */} +
+ {/* Blur gradient background in role color */} + +
+
- {minSuccessRate}% +
+
+ + +
+ + {/* Breadcrumb */} + + + Evals + + / + + Hire an AI Engineer + + / + + {role.name} + + / + Compare Candidates + + + {/* Title row */} + +
+ +
+
+

Compare Candidates

+

+ {role.name} +

+

+ Interactive comparison across composite score, success rate, cost efficiency, and + speed. Filter by provider, language, and minimum success rate. +

+
+
+ + {/* Stats bar */} + + + + + {filteredCandidates.length} + + of {allCandidates.length} candidates shown + +
+ + Viewing{" "} + + {selectedLanguage === "all" + ? "All Languages" + : LANGUAGES.find((l) => l.key === selectedLanguage)?.label} + + + {minSuccessRate > 0 && ( + <> +
+ + Min success{" "} + + {minSuccessRate}% + + + + )} + +
- {/* ── Chart ──────────────────────────────────────────────────── */} -
-

- {selectedLanguage === "all" - ? "Composite Score" - : `${LANGUAGES.find((l) => l.key === selectedLanguage)?.label} Score`}{" "} - Comparison -

-

- Cost Efficiency and Speed are inverted — higher bars mean cheaper / faster. Daily costs assume ~ - {TASKS_PER_DAY} tasks per agent per day (~6 productive hours). -

- - {chartData.length === 0 ? ( -
- No candidates match the current filters. -
- ) : ( - - - `${v}`} /> - - } /> - - l.key === selectedLanguage)?.label ?? "Language"} Score` - } - fill={DIMENSION_COLORS.composite} - radius={[0, 4, 4, 0]} - barSize={12} - /> - - - - - - )} -
- - {/* ── Export Buttons ──────────────────────────────────────────── */} -
- - - -
- - {/* ── Bottom Navigation ───────────────────────────────────────── */} - -
+ {/* ── Main Content ──────────────────────────────────────────── */} +
+ + {/* ── Filters Section ────────────────────────────────────── */} + +
+
+ +
+

+ Filters +

+
+ +
+ {/* Language toggle pills */} +
+

+ Score View +

+
+ + {LANGUAGES.map(({ key, label }) => ( + + ))} +
+
+ + {/* Min success rate slider */} +
+

+ Min Success Rate +

+
+ setMinSuccessRate(Number(e.target.value))} + className={`h-2 flex-1 cursor-pointer appearance-none rounded-full bg-muted/50 ${theme.sliderAccent}`} + /> + + {minSuccessRate}% + +
+
+
+ + {/* Provider checkboxes */} +
+

+ Providers +

+
+ {activeProviders.map((p) => ( + + ))} +
+
+
+ + {/* ── Chart Section ──────────────────────────────────────── */} + +
+

+ {selectedLanguage === "all" + ? "Composite Score" + : `${LANGUAGES.find((l) => l.key === selectedLanguage)?.label} Score`}{" "} + Comparison +

+
+

+ Cost Efficiency and Speed are inverted — higher bars mean cheaper / faster. Daily costs + assume ~{TASKS_PER_DAY} tasks per agent per day (~6 productive hours). +

+ + {chartData.length === 0 ? ( +
+ +

No candidates match the current filters.

+

+ Try adjusting the provider or success rate filters. +

+
+ ) : ( +
+ + + `${v}`} + stroke="hsl(var(--muted-foreground))" + strokeOpacity={0.3} + tick={{ fontSize: 11, fill: "hsl(var(--muted-foreground))" }} + axisLine={false} + /> + + } + cursor={{ fill: "hsl(var(--muted))", fillOpacity: 0.15 }} + /> + + l.key === selectedLanguage)?.label ?? "Language"} Score` + } + fill={DIMENSION_COLORS.composite} + radius={[0, 4, 4, 0]} + barSize={12} + /> + + + + + +
+ )} +
+ + {/* ── Export Section ──────────────────────────────────────── */} + +
+
+ +
+

+ Export Data +

+
+ +
+ + + +
+
+ + {/* ── Bottom Navigation ───────────────────────────────────── */} + +
+
+ + + Back to {role.name} candidates + +
+
+ + + All roles + + + 📋 Raw eval data + + +
+
+
+
+
+ ) } From 336a6726f99118db940192a39d08d432bb09e8a9 Mon Sep 17 00:00:00 2001 From: Michael Preuss Date: Wed, 11 Feb 2026 19:44:30 -0800 Subject: [PATCH 05/22] fix(web-evals): improve comparison chart bar spacing and height --- .../workers/[roleId]/compare/comparison-chart.tsx | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx index 7029c900102..bc9b9cb8df9 100644 --- a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx +++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx @@ -378,7 +378,7 @@ export function ComparisonChart({ recommendation, role, roleId }: ComparisonChar [filteredCandidates, selectedLanguage, maxCost, maxTime], ) - const chartHeight = Math.max(300, chartData.length * 60 + 80) + const chartHeight = Math.max(400, chartData.length * 100) // Providers that actually appear in data const activeProviders = useMemo(() => { @@ -655,6 +655,8 @@ export function ComparisonChart({ recommendation, role, roleId }: ComparisonChar From bf55aa0103044a1536ec27dd8bc312fa1cba3d42 Mon Sep 17 00:00:00 2001 From: Michael Preuss Date: Wed, 11 Feb 2026 19:53:10 -0800 Subject: [PATCH 06/22] feat(web-evals): add scatter plot charts (value map + capability timeline) - Add "Value Map: Salary vs Interview Score" scatter to comparison page - Dots colored by tier, sized by success rate - Sweet Spot quadrant highlight (upper-left) - Respects existing provider/success-rate filters - Add "AI Coding Capability Over Time" scatter to landing page - 10 models from Jun 2025 to Feb 2026 - Dots colored by provider, sized by cost efficiency - Dashed trend line showing upward trajectory - Add MODEL_TIMELINE data to mock-recommendations.ts --- .../[roleId]/compare/comparison-chart.tsx | 222 ++++++++++++++- .../src/app/evals/workers/workers-content.tsx | 
265 +++++++++++++++++- .../src/lib/mock-recommendations.ts | 24 ++ 3 files changed, 509 insertions(+), 2 deletions(-) diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx index bc9b9cb8df9..8e5aa85b7fd 100644 --- a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx +++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx @@ -15,7 +15,20 @@ import { Download, FlaskConical, } from "lucide-react" -import { BarChart, Bar, XAxis, YAxis, Tooltip, ResponsiveContainer, Legend } from "recharts" +import { + BarChart, + Bar, + XAxis, + YAxis, + Tooltip, + ResponsiveContainer, + Legend, + ScatterChart, + Scatter, + ZAxis, + Cell, + ReferenceArea, +} from "recharts" import type { ModelCandidate, LanguageScores, EngineerRole, RoleRecommendation } from "@/lib/mock-recommendations" import { TASKS_PER_DAY } from "@/lib/mock-recommendations" @@ -225,6 +238,20 @@ const DIMENSION_COLORS = { speed: "#a855f7", // purple } +const TIER_COLORS: Record<string, string> = { + best: "#22c55e", // green + recommended: "#3b82f6", // blue + situational: "#eab308", // yellow + "not-recommended": "#ef4444", // red +} + +const TIER_LABELS: Record<string, string> = { + best: "Best", + recommended: "Recommended", + situational: "Situational", + "not-recommended": "Not Recommended", +} + // ── Helpers ───────────────────────────────────────────────────────────────── /** Normalize cost: lower cost → higher bar (0–100). 
*/ @@ -344,6 +371,62 @@ function CustomTooltip({ ) } +// ── Scatter Tooltip ───────────────────────────────────────────────────────── + +function ScatterTooltip({ + active, + payload, +}: { + active?: boolean + // eslint-disable-next-line @typescript-eslint/no-explicit-any + payload?: any[] +}) { + if (!active || !payload || !payload.length) return null + + const data = payload[0]?.payload as + | { + name?: string + dailyCost?: number + score?: number + successRate?: number + tier?: string + } + | undefined + + if (!data) return null + + return ( +
+

{data.name}

+
+
+ + Tier: + {TIER_LABELS[data.tier ?? "situational"]} +
+
+ + Daily Salary: + ${data.dailyCost}/day +
+
+ + Interview Score: + {data.score} +
+
+ + Success Rate: + {data.successRate}% +
+
+
+ ) +} + // ── Main Component ────────────────────────────────────────────────────────── interface ComparisonChartProps { @@ -380,6 +463,25 @@ export function ComparisonChart({ recommendation, role, roleId }: ComparisonChar const chartHeight = Math.max(400, chartData.length * 100) + // Scatter plot data: value map of daily cost vs composite score + const scatterData = useMemo( + () => + filteredCandidates.map((c) => ({ + name: c.displayName, + dailyCost: Math.round(c.estimatedDailyCost), + score: c.compositeScore, + successRate: c.successRate, + tier: c.tier, + // ZAxis size: map success rate to dot size (60–400 range) + dotSize: Math.round(60 + (c.successRate / 100) * 340), + })), + [filteredCandidates], + ) + + // Determine axis domains for scatter plot + const scatterMaxCost = useMemo(() => Math.max(...scatterData.map((d) => d.dailyCost), 10), [scatterData]) + const scatterMinScore = useMemo(() => Math.min(...scatterData.map((d) => d.score), 50), [scatterData]) + // Providers that actually appear in data const activeProviders = useMemo(() => { const providers = new Set(allCandidates.map((c) => c.provider)) @@ -624,6 +726,124 @@ export function ComparisonChart({ recommendation, role, roleId }: ComparisonChar
+ {/* ── Value Map Scatter Chart ────────────────────────────── */} + +
+

Value Map: Salary vs Interview Score

+
+

+ Upper-left = best value. Each dot is a candidate model. Size reflects success rate. +

+ + {/* Tier legend */} +
+ {Object.entries(TIER_COLORS).map(([tier, color]) => ( +
+ + {TIER_LABELS[tier]} +
+ ))} +
+ + {scatterData.length === 0 ? ( +
+ +

No candidates match the current filters.

+

+ Try adjusting the provider or success rate filters. +

+
+ ) : ( +
+ + + `$${v}`} + stroke="hsl(var(--muted-foreground))" + strokeOpacity={0.3} + tick={{ fontSize: 11, fill: "hsl(var(--muted-foreground))" }} + axisLine={false} + label={{ + value: "Daily Salary ($)", + position: "insideBottom", + offset: -10, + style: { fontSize: 11, fill: "hsl(var(--muted-foreground))" }, + }} + /> + + + {/* Sweet spot reference zone: upper-left quadrant */} + + } + cursor={{ + strokeDasharray: "3 3", + stroke: "hsl(var(--muted-foreground))", + strokeOpacity: 0.3, + }} + /> + + {scatterData.map((entry, index) => ( + + ))} + + + +
+ )} +
+ {/* ── Chart Section ──────────────────────────────────────── */} = { + anthropic: "#fb923c", // orange-400 + openai: "#4ade80", // green-400 + google: "#60a5fa", // blue-400 + xai: "#c084fc", // purple-400 + deepseek: "#22d3ee", // cyan-400 + moonshot: "#f472b6", // pink-400 +} + +const PROVIDER_DISPLAY: Record = { + anthropic: "Anthropic", + openai: "OpenAI", + google: "Google", + xai: "xAI", + deepseek: "DeepSeek", + moonshot: "Moonshot", +} + +// ── Timeline Tooltip ──────────────────────────────────────────────────────── + +function TimelineTooltip({ + active, + payload, +}: { + active?: boolean + // eslint-disable-next-line @typescript-eslint/no-explicit-any + payload?: any[] +}) { + if (!active || !payload || !payload.length) return null + + const data = payload[0]?.payload as + | { + modelName?: string + provider?: string + score?: number + costPerRun?: number + dateLabel?: string + } + | undefined + + if (!data) return null + + return ( +
+

{data.modelName}

+
+
+ + Provider: + {PROVIDER_DISPLAY[data.provider ?? ""] ?? data.provider} +
+
+ + Release: + {data.dateLabel} +
+
+ + Eval Score: + {data.score}% +
+
+ + Cost per Run: + ${data.costPerRun?.toFixed(2)} +
+
+
+ ) +} + // ── Sub-Components ────────────────────────────────────────────────────────── function StatPill({ icon: Icon, value, label }: { icon: LucideIcon; value: string; label: string }) { @@ -218,6 +296,33 @@ export function WorkersContent({ }: WorkersContentProps) { const recByRole = new Map(recommendations.map((r) => [r.roleId, r])) + // ── Timeline scatter data ────────────────────────────────────────────── + const timelineData = useMemo(() => { + const maxCost = Math.max(...MODEL_TIMELINE.map((m) => m.costPerRun)) + return MODEL_TIMELINE.map((m) => { + const date = new Date(m.releaseDate) + return { + modelName: m.modelName, + provider: m.provider, + score: m.score, + costPerRun: m.costPerRun, + // numeric X for scatter: milliseconds since epoch + dateNum: date.getTime(), + dateLabel: date.toLocaleDateString("en-US", { month: "short", year: "numeric" }), + // Dot size: decreases linearly with cost (cheaper = bigger dot) + dotSize: Math.round(60 + (1 - m.costPerRun / maxCost) * 340), + } + }).sort((a, b) => a.dateNum - b.dateNum) + }, []) + + // Trend line endpoints for the timeline + const trendLine = useMemo(() => { + if (timelineData.length < 2) return null + const first = timelineData[0]! + const last = timelineData[timelineData.length - 1]! + return { x1: first.dateNum, y1: first.score, x2: last.dateNum, y2: last.score } + }, [timelineData]) + return ( <> {/* ── Hero Section ───────────────────────────────────────────── */} @@ -455,6 +560,164 @@ export function WorkersContent({
+ {/* ── AI Coding Capability Over Time ─────────────────────────── */} +
+ {/* Subtle atmospheric background */} + +
+
+
+ + +
+ + {/* Section header */} + +

+ AI Coding Capability{" "} + + Over Time + +

+

+ Pass rates on our eval suite, by model release date. The best ones now score 100%. +

+
+ + {/* Chart container */} + + {/* Provider legend */} +
+ {Object.entries(PROVIDER_COLORS) + .filter(([provider]) => MODEL_TIMELINE.some((m) => m.provider === provider)) + .map(([provider, color]) => ( +
+ + {PROVIDER_DISPLAY[provider] ?? provider} +
+ ))} +
+ + Bigger dot = lower cost +
+
+ +
+ + + { + const d = new Date(v) + return d.toLocaleDateString("en-US", { + month: "short", + year: "2-digit", + }) + }} + stroke="hsl(var(--muted-foreground))" + strokeOpacity={0.3} + tick={{ fontSize: 11, fill: "hsl(var(--muted-foreground))" }} + axisLine={false} + label={{ + value: "Release Date", + position: "insideBottom", + offset: -10, + style: { fontSize: 11, fill: "hsl(var(--muted-foreground))" }, + }} + /> + `${v}%`} + stroke="hsl(var(--muted-foreground))" + strokeOpacity={0.3} + tick={{ fontSize: 11, fill: "hsl(var(--muted-foreground))" }} + axisLine={false} + label={{ + value: "Eval Score (%)", + angle: -90, + position: "insideLeft", + offset: 10, + style: { fontSize: 11, fill: "hsl(var(--muted-foreground))" }, + }} + /> + + {/* Trend line: dashed line from first to last */} + {trendLine && ( + + )} + {/* 100% reference line */} + + } + cursor={{ + strokeDasharray: "3 3", + stroke: "hsl(var(--muted-foreground))", + strokeOpacity: 0.3, + }} + /> + + {timelineData.map((entry, index) => ( + + ))} + + + +
+
+
+
+
+ {/* ── Footer / Methodology Section ───────────────────────────── */}
diff --git a/apps/web-roo-code/src/lib/mock-recommendations.ts b/apps/web-roo-code/src/lib/mock-recommendations.ts index 214d12bd27c..8c57d6c147d 100644 --- a/apps/web-roo-code/src/lib/mock-recommendations.ts +++ b/apps/web-roo-code/src/lib/mock-recommendations.ts @@ -864,3 +864,27 @@ export function getCloudSetupUrl(candidate: ModelCandidate): string { }) return `https://app.roocode.com/sign-up?${params.toString()}` } + +// ── Model Timeline Data ──────────────────────────────────────────────────── +// Historical model performance over time for the landing page chart. + +export type ModelTimelineEntry = { + modelName: string + provider: string + releaseDate: string // ISO date + score: number // our eval score (total %) + costPerRun: number // total cost for the full eval run +} + +export const MODEL_TIMELINE: ModelTimelineEntry[] = [ + { modelName: "Claude 3.5 Sonnet", provider: "anthropic", releaseDate: "2025-06-20", score: 90, costPerRun: 24.98 }, + { modelName: "GPT-4.1", provider: "openai", releaseDate: "2025-08-14", score: 91, costPerRun: 38.64 }, + { modelName: "Claude 3.7 Sonnet", provider: "anthropic", releaseDate: "2025-09-15", score: 95, costPerRun: 37.58 }, + { modelName: "Gemini 2.5 Pro", provider: "google", releaseDate: "2025-10-01", score: 96, costPerRun: 57.8 }, + { modelName: "Claude Sonnet 4", provider: "anthropic", releaseDate: "2025-11-01", score: 98, costPerRun: 39.61 }, + { modelName: "GPT-5 Mini", provider: "openai", releaseDate: "2025-12-01", score: 99, costPerRun: 3.34 }, + { modelName: "Claude Sonnet 4.5", provider: "anthropic", releaseDate: "2026-01-15", score: 100, costPerRun: 38.43 }, + { modelName: "GPT 5.2 (Med)", provider: "openai", releaseDate: "2026-01-20", score: 100, costPerRun: 12.5 }, + { modelName: "Claude Opus 4.6", provider: "anthropic", releaseDate: "2026-02-01", score: 100, costPerRun: 49.48 }, + { modelName: "Gemini 3 Pro", provider: "google", releaseDate: "2026-02-05", score: 100, costPerRun: 33.06 }, +] From 
6c849fb8c3d33f08e6962fb84da685cbfbaed5da Mon Sep 17 00:00:00 2001 From: Michael Preuss Date: Wed, 11 Feb 2026 19:57:58 -0800 Subject: [PATCH 07/22] refactor(web-evals): generalize methodology roles section for scalability --- .../evals/methodology/methodology-content.tsx | 189 ++++++++---------- 1 file changed, 87 insertions(+), 102 deletions(-) diff --git a/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx b/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx index 18d9bcef3a2..0285be5970a 100644 --- a/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx +++ b/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx @@ -5,8 +5,6 @@ import { ArrowRight, FlaskConical, Code, - GitBranch, - Building2, AlertTriangle, BarChart3, Terminal, @@ -17,6 +15,7 @@ import { DollarSign, Zap, Trophy, + Scale, } from "lucide-react" import Link from "next/link" @@ -476,126 +475,112 @@ export function MethodologyContent() { Engineer Roles - - Not every task needs the same level of engineering. Three role tiers, each with different - exercise difficulty and scoring weights. - +

+ Each role represents a different engineering seniority level. We test models against + exercises matched to that role's complexity, then score using role-specific weights. +

+ - {/* Role cards */} - - {/* Junior */} + {/* How weights differ */} + -
-
-
- -
-

Junior Engineer

-

- Easy + Medium exercises. Boilerplate, simple bug fixes, test generation. Scoring - emphasizes{" "} - cost efficiency. -

- {/* Weight breakdown */} -
-

- Scoring Weights -

-
-
-
-
-
-
-
- Success 35% - Quality 15% - Cost 35% - Speed 15% -
-
-
+

Different Roles, Different Weights

+

+ Each role has its own scoring weights. A model that's great for simple tasks might + not rank for architecture decisions. +

- - {/* Senior */} -
-
-
- +

Matched Exercises

+

+ Budget roles get simpler exercises. Complex roles get harder ones. The difficulty and + scoring shift together so recommendations stay relevant. +

+ + + + {/* Budget vs Complex comparison */} + +

+ How Scoring Weights Shift +

+
+ {/* Budget roles */} +
+
+
-

Senior Engineer

-

- Medium exercises. Feature development, debugging, code review. Balanced scoring with - emphasis on{" "} - success rate + quality - . -

- {/* Weight breakdown */} -
-

- Scoring Weights +

+

Budget Roles

+

+ Cost and speed matter most. Simpler exercises where many models succeed, so + efficiency breaks the tie.

-
-
-
-
-
+
+
+
+
+
+
+
-
- Success 40% - Quality 25% - Cost 20% - Speed 15% +
+ Success + Quality + Cost ↑ + Speed
- - - {/* Staff */} - -
-
-
- + {/* Complex roles */} +
+
+
-

Staff Engineer

-

- Hard exercises. Architecture, ambiguous requirements, system design. Scoring - prioritizes{" "} - - reasoning quality + correctness - - . -

- {/* Weight breakdown */} -
-

- Scoring Weights +

+

Complex Roles

+

+ Reasoning quality and success rate matter most. Harder exercises where only the + best models deliver.

-
-
-
-
-
+
+
+
+
+
+
+
-
- Success 45% - Quality 30% - Cost 10% - Speed 15% +
+ + Success ↑ + + + Quality ↑ + + Cost + Speed
- +
+ + + {/* Link to roles page */} + + + Browse all engineer roles + +
From 514dadfe472a0d4dfbd11ac7eceae3f69a27e67c Mon Sep 17 00:00:00 2001 From: Michael Preuss Date: Wed, 11 Feb 2026 20:00:59 -0800 Subject: [PATCH 08/22] fix(web-evals): pluralize team members text and increase card row gap --- apps/web-roo-code/src/app/evals/workers/workers-content.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/web-roo-code/src/app/evals/workers/workers-content.tsx b/apps/web-roo-code/src/app/evals/workers/workers-content.tsx index 52fc89c1d21..755cc4e3b7d 100644 --- a/apps/web-roo-code/src/app/evals/workers/workers-content.tsx +++ b/apps/web-roo-code/src/app/evals/workers/workers-content.tsx @@ -417,13 +417,13 @@ export function WorkersContent({ viewport={{ once: true }} variants={fadeUpVariants}>

- Choose your agentic team member + Choose your agentic team members

Date: Wed, 11 Feb 2026 20:04:26 -0800 Subject: [PATCH 09/22] style(web-evals): remove tilde prefix from dollar amounts --- .../workers/[roleId]/candidates-content.tsx | 8 +++--- .../[roleId]/compare/comparison-chart.tsx | 2 +- .../src/lib/mock-recommendations.ts | 28 +++++++++---------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx index c4a42a71f8f..842a7ebb6fc 100644 --- a/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx +++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx @@ -436,7 +436,7 @@ function CandidateCard({ Daily Cost

- ~${Math.round(candidate.estimatedDailyCost)}/day + ${Math.round(candidate.estimatedDailyCost)}/day

(${candidate.avgCostPerTask.toFixed(3)}/task) @@ -569,7 +569,7 @@ function CompactCard({

{highlight === "cost" - ? `~$${Math.round(candidate.estimatedDailyCost)}/day` + ? `$${Math.round(candidate.estimatedDailyCost)}/day` : `${candidate.avgTimePerTask.toFixed(1)}s`}

{highlight === "cost" && ( @@ -591,7 +591,7 @@ function CompactCard({ Daily Cost

- ~${Math.round(candidate.estimatedDailyCost)} + ${Math.round(candidate.estimatedDailyCost)}

(${candidate.avgCostPerTask.toFixed(3)}/task) @@ -944,7 +944,7 @@ export function CandidatesContent({ {candidate.successRate}% - ~${Math.round(candidate.estimatedDailyCost)} + ${Math.round(candidate.estimatedDailyCost)} (${candidate.avgCostPerTask.toFixed(3)}) diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx index 8e5aa85b7fd..998e1af261f 100644 --- a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx +++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx @@ -360,7 +360,7 @@ function CustomTooltip({ {entry.name}: {entry.dataKey === "costEfficiency" && dailyCost !== undefined - ? `${entry.value} (~$${dailyCost}/day · $${costPerTask?.toFixed(3)}/task)` + ? `${entry.value} ($${dailyCost}/day · $${costPerTask?.toFixed(3)}/task)` : entry.value}

diff --git a/apps/web-roo-code/src/lib/mock-recommendations.ts b/apps/web-roo-code/src/lib/mock-recommendations.ts index 8c57d6c147d..99cc7f9b800 100644 --- a/apps/web-roo-code/src/lib/mock-recommendations.ts +++ b/apps/web-roo-code/src/lib/mock-recommendations.ts @@ -20,7 +20,7 @@ export const TASKS_PER_DAY = 80 export type EngineerRole = { id: string name: string - /** Daily salary range string, e.g. "~$3–38/day" */ + /** Daily salary range string, e.g. "$3–38/day" */ salaryRange: string description: string bestFor: string[] @@ -81,7 +81,7 @@ const ENGINEER_ROLES: EngineerRole[] = [ { id: "junior", name: "Junior Engineer", - salaryRange: "~$2–10/day", + salaryRange: "$2–10/day", description: "Handles well-scoped, single-file tasks: boilerplate, simple bug fixes, and test generation at the lowest cost per task.", bestFor: ["Single-file fixes", "Boilerplate generation", "Test generation", "Simple implementations"], @@ -92,7 +92,7 @@ const ENGINEER_ROLES: EngineerRole[] = [ { id: "senior", name: "Senior Engineer", - salaryRange: "~$10–26/day", + salaryRange: "$10–26/day", description: "The sweet spot for most engineering work. Senior-tier models balance cost and quality across multi-file refactors, feature development, and debugging.", bestFor: ["Multi-file refactors", "Feature development", "Debugging", "Code review"], @@ -107,7 +107,7 @@ const ENGINEER_ROLES: EngineerRole[] = [ { id: "staff", name: "Staff Engineer", - salaryRange: "~$8–34/day", + salaryRange: "$8–34/day", description: "For architecture decisions, system design, and complex refactors. 
Staff-tier models handle ambiguous requirements and cross-cutting changes where other tiers fail.", bestFor: ["Architecture decisions", "Complex features", "System design", "Ambiguous requirements"], @@ -122,7 +122,7 @@ const ENGINEER_ROLES: EngineerRole[] = [ { id: "reviewer", name: "Architecture Reviewer", - salaryRange: "~$15–40/day", + salaryRange: "$15–40/day", description: "For code review, PR feedback, security analysis, and design critique. Reviewer-tier models catch issues other models miss and provide actionable, context-aware suggestions.", bestFor: ["Code review", "PR feedback", "Security analysis", "Design critique", "Refactor guidance"], @@ -141,7 +141,7 @@ const ENGINEER_ROLES: EngineerRole[] = [ { id: "autonomous", name: "Autonomous Agent", - salaryRange: "~$5–30/day", + salaryRange: "$5–30/day", description: "For issue-to-PR workflows, long-running tasks, and multi-step debugging with minimal supervision. Autonomous-tier models complete tasks end-to-end and recover from errors without human intervention.", bestFor: [ @@ -414,7 +414,7 @@ const seniorCandidates: ModelCandidate[] = [ avgTimePerTask: 188.5, languageScores: { go: 97, java: 91, javascript: 96, python: 100, rust: 97 }, settings: { temperature: 0 }, - caveats: ["Most expensive in this tier: ~$39/day ($0.48/task)"], + caveats: ["Most expensive in this tier: $39/day ($0.48/task)"], }, ] @@ -434,7 +434,7 @@ const staffCandidates: ModelCandidate[] = [ avgTimePerTask: 105.5, languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 }, settings: { temperature: 0, reasoningEffort: "medium" }, - caveats: ["100% pass rate at ~$8/day ($0.10/task): best cost-to-quality ratio in this role"], + caveats: ["100% pass rate at $8/day ($0.10/task): best cost-to-quality ratio in this role"], }, { provider: "anthropic", @@ -505,7 +505,7 @@ const staffCandidates: ModelCandidate[] = [ avgTimePerTask: 211.5, languageScores: { go: 97, java: 96, javascript: 98, python: 100, rust: 100 }, 
settings: { temperature: 0 }, - caveats: ["~$93/day ($1.17/task), 11× the cost of the top pick"], + caveats: ["$93/day ($1.17/task), 11× the cost of the top pick"], }, { provider: "openai", @@ -536,7 +536,7 @@ const staffCandidates: ModelCandidate[] = [ languageScores: { go: 92, java: 91, javascript: 94, python: 94, rust: 100 }, settings: { temperature: 0 }, caveats: [ - "Most expensive model tested: ~$115/day ($1.44/task)", + "Most expensive model tested: $115/day ($1.44/task)", "Lower success rate (94%) despite highest cost", ], }, @@ -560,7 +560,7 @@ const reviewerCandidates: ModelCandidate[] = [ avgTimePerTask: 105.5, languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 }, settings: { temperature: 0, reasoningEffort: "medium" }, - caveats: ["100% consistency across all languages: ideal reviewer at ~$8/day"], + caveats: ["100% consistency across all languages: ideal reviewer at $8/day"], }, { provider: "anthropic", @@ -646,7 +646,7 @@ const reviewerCandidates: ModelCandidate[] = [ avgTimePerTask: 188.5, languageScores: { go: 97, java: 91, javascript: 96, python: 100, rust: 97 }, settings: { temperature: 0 }, - caveats: ["Most expensive reviewer: ~$39/day ($0.48/task)", "More variable across languages than top picks"], + caveats: ["Most expensive reviewer: $39/day ($0.48/task)", "More variable across languages than top picks"], }, { provider: "openai", @@ -683,7 +683,7 @@ const autonomousCandidates: ModelCandidate[] = [ avgTimePerTask: 105.5, languageScores: { go: 100, java: 100, javascript: 100, python: 100, rust: 100 }, settings: { temperature: 0, reasoningEffort: "medium" }, - caveats: ["Perfect success rate + fast completion: ideal autonomous agent at ~$8/day"], + caveats: ["Perfect success rate + fast completion: ideal autonomous agent at $8/day"], }, { provider: "openai", @@ -698,7 +698,7 @@ const autonomousCandidates: ModelCandidate[] = [ avgTimePerTask: 173.0, languageScores: { go: 100, java: 98, javascript: 100, python: 100, 
rust: 97 }, settings: { temperature: 0 }, - caveats: ["Cheapest autonomous option at ~$2/day with near-perfect success"], + caveats: ["Cheapest autonomous option at $2/day with near-perfect success"], }, { provider: "xai", From 310b10cdc2f0580d52e3ac2f4174f9ba274c3a92 Mon Sep 17 00:00:00 2001 From: Michael Preuss Date: Thu, 12 Feb 2026 12:07:14 -0800 Subject: [PATCH 10/22] feat: eval-evolution work in progress - recommendations and workers UI --- .../evals/methodology/methodology-content.tsx | 22 +- .../src/app/evals/methodology/page.tsx | 4 +- .../workers-v2/[roleId]/compare/page.tsx | 83 ++ .../app/evals/workers-v2/[roleId]/page.tsx | 100 +++ .../src/app/evals/workers-v2/page.tsx | 91 +++ .../workers/[roleId]/candidates-content.tsx | 51 +- .../[roleId]/compare/comparison-chart.tsx | 71 +- .../evals/workers/[roleId]/compare/page.tsx | 21 +- .../src/app/evals/workers/[roleId]/page.tsx | 10 +- .../src/app/evals/workers/page.tsx | 20 +- .../src/app/evals/workers/workers-content.tsx | 749 ++++++++++++++---- apps/web-roo-code/src/lib/eval-outcomes.ts | 171 ++++ .../src/lib/mock-recommendations.ts | 66 +- 13 files changed, 1205 insertions(+), 254 deletions(-) create mode 100644 apps/web-roo-code/src/app/evals/workers-v2/[roleId]/compare/page.tsx create mode 100644 apps/web-roo-code/src/app/evals/workers-v2/[roleId]/page.tsx create mode 100644 apps/web-roo-code/src/app/evals/workers-v2/page.tsx create mode 100644 apps/web-roo-code/src/lib/eval-outcomes.ts diff --git a/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx b/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx index 0285be5970a..654b1740feb 100644 --- a/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx +++ b/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx @@ -198,19 +198,19 @@ export function MethodologyContent() { / - Hire an AI Engineer + Build with Roo Code Cloud / - How We Interview + Methodology {/* Heading */} - How We Interview{" "} 
+ How We Run{" "} - AI Models + Evals @@ -262,7 +262,7 @@ export function MethodologyContent() { - The Interview Process + The Eval Process {/* ════════════════════════════════════════════════════════════════ - SECTION 02: THE INTERVIEW SUITE + SECTION 02: THE EVAL SUITE ════════════════════════════════════════════════════════════════ */} - The Interview Suite + The Eval Suite {/* ════════════════════════════════════════════════════════════════ - SECTION 05: RUN YOUR OWN INTERVIEWS + SECTION 05: RUN YOUR OWN EVALS ════════════════════════════════════════════════════════════════ */} - Run Your Own Interviews + Run Your Own Evals - Our evaluation framework is fully open source. Run the exact same interviews on your own + Our evaluation framework is fully open source. Run the exact same evals on your own infrastructure, with your own API keys, against any model. diff --git a/apps/web-roo-code/src/app/evals/methodology/page.tsx b/apps/web-roo-code/src/app/evals/methodology/page.tsx index 8a0960142cb..039d7e6fd75 100644 --- a/apps/web-roo-code/src/app/evals/methodology/page.tsx +++ b/apps/web-roo-code/src/app/evals/methodology/page.tsx @@ -7,7 +7,7 @@ import { MethodologyContent } from "./methodology-content" // ── SEO Metadata ──────────────────────────────────────────────────────────── -const TITLE = "How We Interview AI Models | Roo Code Evals" +const TITLE = "Methodology | Roo Code Evals" const DESCRIPTION = "Our methodology for evaluating AI coding models. Transparent, reproducible, evidence-based." 
const OG_DESCRIPTION = "Our methodology for evaluating AI coding models" const PATH = "/evals/methodology" @@ -46,7 +46,7 @@ export const metadata: Metadata = { "model benchmarking", "coding evals", "methodology", - "interview process", + "evaluation process", "transparent evaluation", ], } diff --git a/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/compare/page.tsx b/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/compare/page.tsx new file mode 100644 index 00000000000..2176eaae88b --- /dev/null +++ b/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/compare/page.tsx @@ -0,0 +1,83 @@ +import { notFound } from "next/navigation" +import type { Metadata } from "next" + +import { SEO } from "@/lib/seo" +import { ogImageUrl } from "@/lib/og" +import { getEngineerRole, getRoleRecommendation } from "@/lib/mock-recommendations" + +import { ComparisonChart } from "../../../workers/[roleId]/compare/comparison-chart" + +type PageProps = { params: Promise<{ roleId: string }> } + +export async function generateMetadata({ params }: PageProps): Promise { + const { roleId } = await params + const role = getEngineerRole(roleId) + + if (!role) { + return { + title: "Role Not Found | Roo Code Evals", + description: "The requested role was not found.", + } + } + + const title = `Compare Models — ${role.name} (V2 Preview) | Roo Code Evals` + const description = `Outcome-first comparison of AI models for ${role.name}. 
Compare composite score, success rate, cost efficiency, and speed.` + const ogDescription = `Compare Models — ${role.name} (V2 Preview)` + const path = `/evals/workers-v2/${roleId}/compare` + + return { + title, + description, + alternates: { + canonical: `${SEO.url}${path}`, + }, + openGraph: { + title, + description, + url: `${SEO.url}${path}`, + siteName: SEO.name, + images: [ + { + url: ogImageUrl(title, ogDescription), + width: 1200, + height: 630, + alt: title, + }, + ], + locale: SEO.locale, + type: "website", + }, + twitter: { + card: SEO.twitterCard, + title, + description, + images: [ogImageUrl(title, ogDescription)], + }, + keywords: [ + ...SEO.keywords, + "AI coding", + "model comparison", + "coding evals", + role.name.toLowerCase(), + "outcome-first", + ], + } +} + +export default async function WorkersV2ComparePage({ params }: PageProps) { + const { roleId } = await params + const recommendation = getRoleRecommendation(roleId) + + if (!recommendation) { + notFound() + } + + return ( + + ) +} diff --git a/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/page.tsx b/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/page.tsx new file mode 100644 index 00000000000..8afef01a589 --- /dev/null +++ b/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/page.tsx @@ -0,0 +1,100 @@ +import { notFound } from "next/navigation" +import type { Metadata } from "next" + +import { SEO } from "@/lib/seo" +import { ogImageUrl } from "@/lib/og" +import { getRoleRecommendation, getCloudSetupUrl } from "@/lib/mock-recommendations" + +import { CandidatesContent } from "../../workers/[roleId]/candidates-content" + +type PageProps = { params: Promise<{ roleId: string }> } + +export async function generateMetadata({ params }: PageProps): Promise { + const { roleId } = await params + const recommendation = getRoleRecommendation(roleId) + + if (!recommendation) { + return { + title: "Role Not Found | Roo Code Evals", + description: "The requested role was not found.", + } + } 
+ + const { role } = recommendation + const title = `${role.name} — Recommended Models (V2 Preview) | Roo Code Evals` + const description = `Outcome-first recommendations for ${role.name}. Compare models by success rate, cost, and speed across 5 languages.` + const ogDescription = `${role.name} — Recommended Models (V2 Preview)` + const path = `/evals/workers-v2/${roleId}` + + return { + title, + description, + alternates: { + canonical: `${SEO.url}${path}`, + }, + openGraph: { + title, + description, + url: `${SEO.url}${path}`, + siteName: SEO.name, + images: [ + { + url: ogImageUrl(title, ogDescription), + width: 1200, + height: 630, + alt: title, + }, + ], + locale: SEO.locale, + type: "website", + }, + twitter: { + card: SEO.twitterCard, + title, + description, + images: [ogImageUrl(title, ogDescription)], + }, + keywords: [ + ...SEO.keywords, + "AI coding", + "coding agents", + "model recommendations", + "coding evals", + role.name.toLowerCase(), + "outcome-first", + ], + } +} + +export default async function WorkersV2RolePage({ params }: PageProps) { + const { roleId } = await params + const recommendation = getRoleRecommendation(roleId) + + if (!recommendation) { + notFound() + } + + const { role, best, budgetHire, speedHire, allCandidates, totalEvalRuns, totalExercises, lastUpdated } = + recommendation + + const cloudUrls: Record = {} + for (const candidate of allCandidates) { + cloudUrls[candidate.modelId] = getCloudSetupUrl(candidate) + } + + return ( + + ) +} diff --git a/apps/web-roo-code/src/app/evals/workers-v2/page.tsx b/apps/web-roo-code/src/app/evals/workers-v2/page.tsx new file mode 100644 index 00000000000..5196214f680 --- /dev/null +++ b/apps/web-roo-code/src/app/evals/workers-v2/page.tsx @@ -0,0 +1,91 @@ +import type { Metadata } from "next" +import { Fraunces, IBM_Plex_Sans } from "next/font/google" + +import { SEO } from "@/lib/seo" +import { ogImageUrl } from "@/lib/og" +import { getEngineerRoles, getAllRecommendations } from 
"@/lib/mock-recommendations" + +import { WorkersContent } from "../workers/workers-content" + +const TITLE = "Build with Roo Code Cloud (V2 Preview) | Roo Code Evals" +const DESCRIPTION = + "Outcome-first, eval-backed recommendations for shipping production code. Start from what you need to ship and pick a setup." +const OG_DESCRIPTION = "Outcome-first recommendations for shipping production code" +const PATH = "/evals/workers-v2" + +const display = Fraunces({ subsets: ["latin"], variable: "--font-display" }) +const body = IBM_Plex_Sans({ subsets: ["latin"], weight: ["400", "500", "600"], variable: "--font-body" }) + +export const metadata: Metadata = { + title: TITLE, + description: DESCRIPTION, + alternates: { + canonical: `${SEO.url}${PATH}`, + }, + openGraph: { + title: TITLE, + description: DESCRIPTION, + url: `${SEO.url}${PATH}`, + siteName: SEO.name, + images: [ + { + url: ogImageUrl(TITLE, OG_DESCRIPTION), + width: 1200, + height: 630, + alt: TITLE, + }, + ], + locale: SEO.locale, + type: "website", + }, + twitter: { + card: SEO.twitterCard, + title: TITLE, + description: DESCRIPTION, + images: [ogImageUrl(TITLE, OG_DESCRIPTION)], + }, + keywords: [ + ...SEO.keywords, + "AI coding", + "coding agents", + "roo code cloud", + "model recommendations", + "coding evals", + "shipping code", + "prototype", + "outcome-first", + ], +} + +export default function WorkersV2Page() { + const roles = getEngineerRoles() + const recommendations = getAllRecommendations() + + const totalEvalRuns = recommendations.reduce((sum, recommendation) => sum + recommendation.totalEvalRuns, 0) + const totalExercises = recommendations.reduce((sum, recommendation) => sum + recommendation.totalExercises, 0) + const uniqueModels = new Set( + recommendations.flatMap((recommendation) => recommendation.allCandidates.map((candidate) => candidate.modelId)), + ) + const totalModels = uniqueModels.size + const lastUpdated = recommendations + .map((recommendation) => recommendation.lastUpdated) + 
.sort() + .pop() + + return ( +
+ +
+ ) +} diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx index 842a7ebb6fc..9d48a2b0955 100644 --- a/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx +++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx @@ -21,6 +21,7 @@ import { } from "lucide-react" import type { LucideIcon } from "lucide-react" import Link from "next/link" +import { useSearchParams } from "next/navigation" import type { ModelCandidate, LanguageScores, EngineerRole } from "@/lib/mock-recommendations" @@ -485,7 +486,7 @@ function CandidateCard({ target="_blank" rel="noopener noreferrer" className={`inline-flex w-full items-center justify-center gap-2 rounded-xl ${theme.buttonBg} ${theme.buttonHover} px-4 py-3 text-sm font-semibold text-white transition-all duration-200 hover:scale-[1.02] active:scale-[0.98] shadow-lg`}> - ☁️ Hire This Engineer + ☁️ Open in Roo Code Cloud @@ -620,7 +621,7 @@ function CompactCard({ target="_blank" rel="noopener noreferrer" className={`inline-flex w-full items-center justify-center gap-2 rounded-xl ${theme.buttonBg} ${theme.buttonHover} px-4 py-3 text-sm font-semibold text-white transition-all duration-200 hover:scale-[1.02] active:scale-[0.98] shadow-lg`}> - ☁️ Hire This Engineer + ☁️ Open in Roo Code Cloud @@ -643,6 +644,7 @@ export type CandidatesContentProps = { totalExercises: number lastUpdated: string cloudUrls: Record + workersRootPath?: string } // ── Main Content Component ────────────────────────────────────────────────── @@ -658,9 +660,22 @@ export function CandidatesContent({ totalExercises, lastUpdated, cloudUrls, + workersRootPath = "/evals/workers", }: CandidatesContentProps) { + const searchParams = useSearchParams() const theme = ROLE_THEMES[roleId] ?? DEFAULT_THEME const IconComponent = ICON_MAP[role.icon] ?? Code + const alternateWorkersRootPath = workersRootPath === "/evals/workers-v2" ? 
"/evals/workers" : "/evals/workers-v2" + const alternateVersionLabel = workersRootPath === "/evals/workers-v2" ? "View baseline" : "View V2 preview" + const setupQuery = (() => { + const outcome = searchParams.get("outcome") + if (!outcome) return "" + const params = new URLSearchParams() + params.set("outcome", outcome) + const mode = searchParams.get("mode") + if (mode) params.set("mode", mode) + return `?${params.toString()}` + })() return ( <> @@ -693,8 +708,10 @@ export function CandidatesContent({ Evals / - - Hire an AI Engineer + + Build with Roo Code Cloud / {role.name} @@ -753,9 +770,15 @@ export function CandidatesContent({ href="/evals/methodology" className="group inline-flex items-center gap-1.5 text-sm font-medium text-muted-foreground transition-colors hover:text-foreground"> - How we interview + Methodology +
+ + {alternateVersionLabel} + {/* Strengths + Trade-offs grid */} @@ -792,7 +815,7 @@ export function CandidatesContent({
- {/* ── Top Candidates: Best Overall ────────────────────────────── */} + {/* ── Top Models: Best Overall ────────────────────────────────── */}
- Top Candidates + Top Models @@ -835,7 +858,7 @@ export function CandidatesContent({ {budgetHire && ( )} - {/* ── All Candidates Table ────────────────────────────────────── */} + {/* ── All Models Table ────────────────────────────────────────── */}
{/* Subtle background */} - All Candidates + All Models 📊 Compare all candidates @@ -983,7 +1006,7 @@ export function CandidatesContent({ variants={containerVariants}> Back to all roles @@ -991,7 +1014,7 @@ export function CandidatesContent({ 📊 Compare candidates diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx index 998e1af261f..d03f83fe67b 100644 --- a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx +++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx @@ -2,6 +2,7 @@ import { useState, useMemo, useCallback } from "react" import Link from "next/link" +import { useSearchParams } from "next/navigation" import { motion } from "framer-motion" import { ArrowLeft, @@ -409,12 +410,12 @@ function ScatterTooltip({
- Daily Salary: + Daily Spend: ${data.dailyCost}/day
- Interview Score: + Eval Score: {data.score}
@@ -433,11 +434,29 @@ interface ComparisonChartProps { recommendation: RoleRecommendation role: EngineerRole roleId: string + workersRootPath?: string } -export function ComparisonChart({ recommendation, role, roleId }: ComparisonChartProps) { +export function ComparisonChart({ + recommendation, + role, + roleId, + workersRootPath = "/evals/workers", +}: ComparisonChartProps) { + const searchParams = useSearchParams() const { allCandidates } = recommendation const theme = ROLE_THEMES[roleId] ?? DEFAULT_THEME + const alternateWorkersRootPath = workersRootPath === "/evals/workers-v2" ? "/evals/workers" : "/evals/workers-v2" + const alternateVersionLabel = workersRootPath === "/evals/workers-v2" ? "View baseline" : "View V2 preview" + const setupQuery = (() => { + const outcome = searchParams.get("outcome") + if (!outcome) return "" + const params = new URLSearchParams() + params.set("outcome", outcome) + const mode = searchParams.get("mode") + if (mode) params.set("mode", mode) + return `?${params.toString()}` + })() // ── State ─────────────────────────────────────────────────────────────── const [selectedLanguage, setSelectedLanguage] = useState("all") @@ -561,15 +580,25 @@ export function ComparisonChart({ recommendation, role, roleId }: ComparisonChar Evals / - - Hire an AI Engineer + + Build with Roo Code Cloud / - + {role.name} / - Compare Candidates + Compare Models + / + + {alternateVersionLabel} + {/* Title row */} @@ -579,7 +608,7 @@ export function ComparisonChart({ recommendation, role, roleId }: ComparisonChar
-

Compare Candidates

+

Compare Models

{role.name}

@@ -599,7 +628,7 @@ export function ComparisonChart({ recommendation, role, roleId }: ComparisonChar {filteredCandidates.length} - of {allCandidates.length} candidates shown + of {allCandidates.length} models shown
@@ -731,10 +760,10 @@ export function ComparisonChart({ recommendation, role, roleId }: ComparisonChar className="rounded-2xl border border-border/50 bg-card/50 p-6 backdrop-blur-sm" variants={fadeUpVariants}>
-

Value Map: Salary vs Interview Score

+

Value Map: Spend vs Eval Score

- Upper-left = best value. Each dot is a candidate model. Size reflects success rate. + Upper-left = higher score at lower spend. Each dot is a model. Size reflects success rate.

{/* Tier legend */} @@ -753,7 +782,7 @@ export function ComparisonChart({ recommendation, role, roleId }: ComparisonChar {scatterData.length === 0 ? (
-

No candidates match the current filters.

+

No models match the current filters.

Try adjusting the provider or success rate filters.

@@ -765,7 +794,7 @@ export function ComparisonChart({ recommendation, role, roleId }: ComparisonChar `$${v}`} stroke="hsl(var(--muted-foreground))" @@ -773,7 +802,7 @@ export function ComparisonChart({ recommendation, role, roleId }: ComparisonChar tick={{ fontSize: 11, fill: "hsl(var(--muted-foreground))" }} axisLine={false} label={{ - value: "Daily Salary ($)", + value: "Daily Spend ($)", position: "insideBottom", offset: -10, style: { fontSize: 11, fill: "hsl(var(--muted-foreground))" }, @@ -782,14 +811,14 @@ export function ComparisonChart({ recommendation, role, roleId }: ComparisonChar - + {scatterData.map((entry, index) => ( -

No candidates match the current filters.

+

No models match the current filters.

Try adjusting the provider or success rate filters.

@@ -993,15 +1022,15 @@ export function ComparisonChart({ recommendation, role, roleId }: ComparisonChar
- Back to {role.name} candidates + Back to {role.name} models
All roles diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/page.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/page.tsx index d79b22eedda..9d03aa9cca7 100644 --- a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/page.tsx +++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/page.tsx @@ -22,9 +22,9 @@ export async function generateMetadata({ params }: PageProps): Promise } } - const title = `Compare Candidates — ${role.name} | Roo Code Evals` - const description = `Interactive comparison of AI model candidates for the ${role.name} role. Compare composite score, success rate, cost efficiency, and speed.` - const ogDescription = `Compare Candidates — ${role.name}` + const title = `Compare Models — ${role.name} | Roo Code Evals` + const description = `Interactive comparison of AI models for the ${role.name} setup. Compare composite score, success rate, cost efficiency, and speed.` + const ogDescription = `Compare Models — ${role.name}` const path = `/evals/workers/${roleId}/compare` return { @@ -57,19 +57,19 @@ export async function generateMetadata({ params }: PageProps): Promise }, keywords: [ ...SEO.keywords, - "AI engineer", + "AI coding", "model comparison", "coding evals", role.name.toLowerCase(), "bar chart", - "candidate comparison", + "model comparison", ], } } // ── Page Component ────────────────────────────────────────────────────────── -export default async function CompareCandidatesPage({ params }: PageProps) { +export default async function CompareModelsPage({ params }: PageProps) { const { roleId } = await params const recommendation = getRoleRecommendation(roleId) @@ -77,5 +77,12 @@ export default async function CompareCandidatesPage({ params }: PageProps) { notFound() } - return + return ( + + ) } diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/page.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/page.tsx index 8daa554cd2d..54ca16d0d26 100644 --- 
a/apps/web-roo-code/src/app/evals/workers/[roleId]/page.tsx +++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/page.tsx @@ -23,9 +23,9 @@ export async function generateMetadata({ params }: PageProps): Promise } const { role } = recommendation - const title = `${role.name} — AI Engineer Candidates | Roo Code Evals` - const description = `Interview results for ${role.name} AI candidates. Compare models by success rate, cost, and speed across 5 languages.` - const ogDescription = `${role.name} — AI Engineer Candidates` + const title = `${role.name} — Recommended Models | Roo Code Evals` + const description = `Eval-backed recommendations for ${role.name}. Compare models by success rate, cost, and speed across 5 languages.` + const ogDescription = `${role.name} — Recommended Models` const path = `/evals/workers/${roleId}` return { @@ -58,7 +58,8 @@ export async function generateMetadata({ params }: PageProps): Promise }, keywords: [ ...SEO.keywords, - "AI engineer", + "AI coding", + "coding agents", "model recommendations", "coding evals", role.name.toLowerCase(), @@ -99,6 +100,7 @@ export default async function RoleCandidatesPage({ params }: PageProps) { totalExercises={totalExercises} lastUpdated={lastUpdated} cloudUrls={cloudUrls} + workersRootPath="/evals/workers" /> ) } diff --git a/apps/web-roo-code/src/app/evals/workers/page.tsx b/apps/web-roo-code/src/app/evals/workers/page.tsx index d7a8f5cc17a..a2b95a1bd88 100644 --- a/apps/web-roo-code/src/app/evals/workers/page.tsx +++ b/apps/web-roo-code/src/app/evals/workers/page.tsx @@ -8,10 +8,10 @@ import { WorkersContent } from "./workers-content" // ── SEO Metadata ──────────────────────────────────────────────────────────── -const TITLE = "Hire an AI Engineer | Roo Code Evals" +const TITLE = "Build with Roo Code Cloud | Roo Code Evals" const DESCRIPTION = - "Find the right AI coding model for your team. Compare interview results across Junior, Senior, and Staff Engineer roles." 
-const OG_DESCRIPTION = "Find the right AI coding model for your team" + "Eval-backed model recommendations for shipping production code. Pick a setup based on the work you're doing: single-file fixes, multi-file changes, review, and autonomous runs." +const OG_DESCRIPTION = "Eval-backed model recommendations for shipping production code" const PATH = "/evals/workers" export const metadata: Metadata = { @@ -44,18 +44,20 @@ export const metadata: Metadata = { }, keywords: [ ...SEO.keywords, - "AI engineer", + "AI coding", + "coding agents", + "roo code cloud", "model recommendations", "coding evals", "model comparison", - "hire AI", - "talent marketplace", + "shipping code", + "prototype", ], } // ── Page Component ────────────────────────────────────────────────────────── -export default function HireAnAIEngineerPage() { +export default function WorkersPage() { const roles = getEngineerRoles() const recommendations = getAllRecommendations() @@ -80,6 +82,10 @@ export default function HireAnAIEngineerPage() { totalExercises={totalExercises} totalModels={totalModels} lastUpdated={lastUpdated} + workersRootPath="/evals/workers" + enableOutcomeLayer={false} + alternateVersionHref="/evals/workers-v2" + alternateVersionLabel="View V2 preview" /> ) } diff --git a/apps/web-roo-code/src/app/evals/workers/workers-content.tsx b/apps/web-roo-code/src/app/evals/workers/workers-content.tsx index 755cc4e3b7d..e5c770ea6da 100644 --- a/apps/web-roo-code/src/app/evals/workers/workers-content.tsx +++ b/apps/web-roo-code/src/app/evals/workers/workers-content.tsx @@ -1,6 +1,6 @@ "use client" -import { useMemo } from "react" +import { useCallback, useMemo } from "react" import { motion } from "framer-motion" import { Code, @@ -20,10 +20,12 @@ import { } from "lucide-react" import type { LucideIcon } from "lucide-react" import Link from "next/link" +import { usePathname, useRouter, useSearchParams } from "next/navigation" import { ScatterChart, Scatter, XAxis, YAxis, ZAxis, Tooltip, 
ResponsiveContainer, Cell, ReferenceLine } from "recharts" import type { EngineerRole, RoleRecommendation } from "@/lib/mock-recommendations" import { TASKS_PER_DAY, MODEL_TIMELINE } from "@/lib/mock-recommendations" +import { EVAL_OUTCOMES, isEvalOutcomeId, type EvalOutcomeId } from "@/lib/eval-outcomes" // ── Icon Mapping ──────────────────────────────────────────────────────────── @@ -139,6 +141,37 @@ const ROLE_THEMES: Record = { const DEFAULT_THEME = ROLE_THEMES.senior! +// ── Outcome Layer: Optimization Modes ────────────────────────────────────── + +type EvalOptimizationMode = "best" | "fastest" | "cost" + +const OPTIMIZATION_MODES: Array<{ + id: EvalOptimizationMode + label: string + description: string +}> = [ + { id: "best", label: "Best", description: "Best overall quality across our eval suite." }, + { id: "fastest", label: "Fastest", description: "Lower latency per task when speed matters." }, + { id: "cost", label: "Most cost-effective", description: "Lower cost per task for high-volume work." }, +] + +function isEvalOptimizationMode(value: string): value is EvalOptimizationMode { + return value === "best" || value === "fastest" || value === "cost" +} + +function getModeCandidate(rec: RoleRecommendation | undefined, mode: EvalOptimizationMode) { + if (!rec) return null + if (mode === "fastest") return rec.speedHire ?? rec.best[0] ?? null + if (mode === "cost") return rec.budgetHire ?? rec.best[0] ?? null + return rec.best[0] ?? 
null +} + +function getModeLabel(mode: EvalOptimizationMode) { + if (mode === "fastest") return "Fastest" + if (mode === "cost") return "Most cost-effective" + return "Best" +} + // ── Framer Motion Variants ────────────────────────────────────────────────── const containerVariants = { @@ -284,6 +317,10 @@ type WorkersContentProps = { totalExercises: number totalModels: number lastUpdated: string | undefined + workersRootPath?: string + enableOutcomeLayer?: boolean + alternateVersionHref?: string + alternateVersionLabel?: string } export function WorkersContent({ @@ -293,8 +330,91 @@ export function WorkersContent({ totalExercises, totalModels, lastUpdated, + workersRootPath = "/evals/workers", + enableOutcomeLayer = false, + alternateVersionHref, + alternateVersionLabel, }: WorkersContentProps) { + const router = useRouter() + const pathname = usePathname() + const searchParams = useSearchParams() + + const selectedOutcomeId = useMemo(() => { + const outcome = searchParams.get("outcome") + if (!outcome) return null + return isEvalOutcomeId(outcome) ? outcome : null + }, [searchParams]) + + const selectedMode = useMemo((): EvalOptimizationMode => { + const mode = searchParams.get("mode") + if (!mode) return "best" + return isEvalOptimizationMode(mode) ? mode : "best" + }, [searchParams]) + + const setOutcome = useCallback( + (nextOutcomeId: EvalOutcomeId | null) => { + const params = new URLSearchParams(searchParams.toString()) + if (nextOutcomeId) params.set("outcome", nextOutcomeId) + else params.delete("outcome") + + const query = params.toString() + router.replace(query ? `${pathname}?${query}` : pathname, { scroll: false }) + }, + [pathname, router, searchParams], + ) + + const setMode = useCallback( + (nextMode: EvalOptimizationMode) => { + const params = new URLSearchParams(searchParams.toString()) + params.set("mode", nextMode) + + const query = params.toString() + router.replace(query ? 
`${pathname}?${query}` : pathname, { scroll: false }) + }, + [pathname, router, searchParams], + ) + const recByRole = new Map(recommendations.map((r) => [r.roleId, r])) + const roleById = useMemo(() => new Map(roles.map((r) => [r.id, r])), [roles]) + + const selectedOutcome = useMemo(() => { + if (!selectedOutcomeId) return null + return EVAL_OUTCOMES.find((o) => o.id === selectedOutcomeId) ?? null + }, [selectedOutcomeId]) + + const setupQuery = useMemo(() => { + if (!enableOutcomeLayer || !selectedOutcomeId) return "" + const params = new URLSearchParams() + params.set("outcome", selectedOutcomeId) + params.set("mode", selectedMode) + const query = params.toString() + return query ? `?${query}` : "" + }, [enableOutcomeLayer, selectedOutcomeId, selectedMode]) + + const profileTitle = selectedOutcome?.builderProfile?.title ?? "Your Builder Profile" + const profileDescription = + selectedOutcome?.builderProfile?.description ?? + "A default setup built from our eval signals. It’s a baseline, not a guarantee." + const profileHowItWorks = selectedOutcome?.builderProfile?.howItWorks ?? selectedOutcome?.whyItWorks ?? [] + + const profileCapabilities = useMemo(() => { + if (!selectedOutcome) return [] + const fromProfile = selectedOutcome.builderProfile?.capabilities + if (fromProfile && fromProfile.length > 0) return fromProfile + return selectedOutcome.recommendedRoleIds.map((roleId) => { + const role = roleById.get(roleId) + return { + id: roleId, + name: role?.name ?? roleId, + description: role?.salaryRange ?? "", + roleId, + } + }) + }, [selectedOutcome, roleById]) + + const agentCapabilities = useMemo(() => profileCapabilities.filter((c) => Boolean(c.roleId)), [profileCapabilities]) + + const builtInCapabilities = useMemo(() => profileCapabilities.filter((c) => !c.roleId), [profileCapabilities]) // ── Timeline scatter data ────────────────────────────────────────────── const timelineData = useMemo(() => { @@ -340,6 +460,21 @@ export function WorkersContent({
+ {/* Blueprint grid overlay (V2) */} + {enableOutcomeLayer ? ( +
+ ) : null} + {/* Gradient fade from hero atmosphere to cards */}
@@ -351,33 +486,91 @@ export function WorkersContent({ variants={containerVariants}> {/* Badge */} - - - How we interview AI models - - +
+ + + How we run evals + + + {alternateVersionHref && alternateVersionLabel ? ( + + {alternateVersionLabel} + + ) : null} +
+ {enableOutcomeLayer ? ( + + Outcomes over artifacts + + ) : null} + {/* Heading */} - Hire an{" "} - - AI Engineer - + {enableOutcomeLayer ? ( + <> + Build from outcomes. +
Ship{" "} + + real code + + . + + ) : ( + <> + Build with{" "} + + Roo Code Cloud + + + )}
{/* Subheading */} - Every model runs the same coding tasks, same tools, same time limit. Pick the right - candidate for your team and budget. + {enableOutcomeLayer ? ( + <> + Pick what you're trying to ship. We assemble a Builder Profile: the + capabilities you need, plus a default model recommendation backed by eval data. + + ) : ( + <> + Outcomes over artifacts: start from the production codebase and ship as a reviewable + PR. Every model runs the same tasks, same tools, and the same time limit. Your repo + will differ—treat this as a baseline. + + )} + {enableOutcomeLayer ? ( + + + Start with Prototype → PR + + + + Browse outcomes + + + ) : null} + {/* Stats bar */}
- {/* ── Role Cards Grid ────────────────────────────────────────── */} -
- {/* Subtle section background */} - -
-
-
- - -
- {/* Section connector */} - -

- Choose your agentic team members -

- -
+ {/* ── Outcomes Overlay ───────────────────────────────────────── */} + {enableOutcomeLayer ? ( +
+
+ + +

+ Start with an outcome +

+

+ Pick what you're trying to ship. We assemble a Builder Profile: capabilities + plus a default model recommendation. It's a baseline, not a guarantee. +

+
- - {roles.map((role) => { - const rec = recByRole.get(role.id) - const IconComponent = ICON_MAP[role.icon] ?? Code - const candidateCount = rec?.allCandidates.length ?? 0 - const exerciseCount = rec?.totalExercises ?? 0 - const theme = ROLE_THEMES[role.id] ?? DEFAULT_THEME - const topModel = rec?.best[0] - - return ( - -
- {/* Subtle glow on hover */} -
+ + {EVAL_OUTCOMES.map((outcome) => { + const Icon = outcome.icon + const isSelected = outcome.id === selectedOutcomeId + const isFeatured = outcome.id === "prototype_to_pr" -
- {/* Header: Icon + role badge */} -
-
- + return ( + setOutcome(isSelected ? null : outcome.id)} + className={[ + "group rounded-2xl border bg-card/40 p-5 text-left backdrop-blur-sm transition-all duration-200 hover:bg-card/60", + isSelected + ? "border-foreground/20 ring-1 ring-foreground/15" + : "border-border/50 hover:border-border", + isFeatured ? "lg:col-span-2" : "", + ].join(" ")}> +
+
+ +
+
+ {isFeatured ? ( + + Recommended starting point + + ) : null} +

+ {outcome.name} +

+

+ {outcome.description} +

- {topModel && ( - - Top: {topModel.displayName} - - )}
+
+ ) + })} + - {/* Role name + salary */} -

{role.name}

-

- {role.salaryRange} + {selectedOutcome ? ( + +

+
+

+ {profileTitle}

- - {/* Description */} -

- {role.description} +

+ {selectedOutcome.name} +

+

+ {profileDescription}

- {/* Best for */} -
-

- Best for -

-
- {role.bestFor.map((item) => ( - - {item} - - ))} -
-
- - {/* Strengths & Weaknesses side by side */} -
- {/* Strengths */} -
-

- Strengths -

-
    - {role.strengths.map((item) => ( -
  • - - {item} + {profileHowItWorks.length > 0 ? ( +
    +

    + {selectedOutcome.builderProfile + ? "How it works" + : "Why it works"} +

    +
      + {profileHowItWorks.map((line) => ( +
    • + + {line}
    • ))}
    + ) : null} - {/* Weaknesses */} -
    -

    - Trade-offs -

    -
      - {role.weaknesses.map((item) => ( -
    • - - {item} + {selectedOutcome.builderProfile?.howItWorks ? ( +
      +

      + Why it works +

      +
        + {selectedOutcome.whyItWorks.map((line) => ( +
      • + + {line}
      • ))}
      -
    + ) : null} +
- {/* Bottom stats + CTA */} -
-
- - - {candidateCount} candidates - - - - {exerciseCount.toLocaleString()} exercises +
+
+
+ + Optimize for + {OPTIMIZATION_MODES.map((mode) => { + const isSelected = mode.id === selectedMode + return ( + + ) + })}
+ + Capability set + +
- - View Candidates - - +
+
+

+ Agents +

+

+ Click for candidates & settings +

+
+
+ {agentCapabilities.map((capability) => { + const roleId = capability.roleId! + const rec = recByRole.get(roleId) + const candidate = getModeCandidate(rec, selectedMode) + + return ( + +
+
+

+ {capability.name} +

+ {capability.description ? ( +

+ {capability.description} +

+ ) : null} +

+ {candidate ? ( + <> + + {getModeLabel(selectedMode)}: + {" "} + {candidate.displayName} + + ) : ( + + View models + + )} +

+
+ +
+ + ) + })} +
+ + {builtInCapabilities.length > 0 ? ( +
+

+ Built-ins +

+
+ {builtInCapabilities.map((capability) => ( +
+
+ +
+
+

+ {capability.name} +

+ + Built-in + +
+

+ {capability.description} +

+
+
+
+ ))} +
+
+ ) : null}
- ) - })} + ) : null} + +
+
+ ) : null} + + {/* ── Role Cards Grid (baseline only) ────────────────────────── */} + {!enableOutcomeLayer ? ( +
+ {/* Subtle section background */} + +
+
+
-
-
+ +
+ {/* Section connector */} + +

+ Choose a setup for the work +

+ +
+ + + {roles.map((role) => { + const rec = recByRole.get(role.id) + const IconComponent = ICON_MAP[role.icon] ?? Code + const candidateCount = rec?.allCandidates.length ?? 0 + const exerciseCount = rec?.totalExercises ?? 0 + const theme = ROLE_THEMES[role.id] ?? DEFAULT_THEME + const topModel = rec?.best[0] ?? null + + return ( + +
+ {/* Subtle glow on hover */} +
+ +
+ {/* Header: Icon + role badge */} +
+
+ +
+ {topModel && ( + + Top: {topModel.displayName} + + )} +
+ + {/* Profile name + descriptor */} +

{role.name}

+

+ {role.salaryRange} +

+ + {/* Description */} +

+ {role.description} +

+ + {/* Best for */} +
+

+ Best for +

+
+ {role.bestFor.map((item) => ( + + {item} + + ))} +
+
+ + {/* Strengths & Weaknesses side by side */} +
+ {/* Strengths */} +
+

+ Strengths +

+
    + {role.strengths.map((item) => ( +
  • + + {item} +
  • + ))} +
+
+ + {/* Weaknesses */} +
+

+ Trade-offs +

+
    + {role.weaknesses.map((item) => ( +
  • + + {item} +
  • + ))} +
+
+
+ + {/* Bottom stats + CTA */} +
+
+ + + {candidateCount} models + + + + {exerciseCount.toLocaleString()} exercises + +
+ + + View models + + +
+
+
+ + ) + })} + +
+
+ ) : null} {/* ── AI Coding Capability Over Time ─────────────────────────── */}
@@ -582,7 +1021,7 @@ export function WorkersContent({ variants={containerVariants}> {/* Section header */} -

+

AI Coding Capability{" "} Over Time diff --git a/apps/web-roo-code/src/lib/eval-outcomes.ts b/apps/web-roo-code/src/lib/eval-outcomes.ts new file mode 100644 index 00000000000..85ced98347e --- /dev/null +++ b/apps/web-roo-code/src/lib/eval-outcomes.ts @@ -0,0 +1,171 @@ +import type { LucideIcon } from "lucide-react" +import { Bug, CheckCircle2, GitPullRequest, Sparkles, Workflow } from "lucide-react" + +export type EvalOutcomeId = + | "prototype_to_pr" + | "paper_cuts" + | "sentry_triage" + | "repro_to_fix" + | "review_guardrails" + | "issue_to_pr" + +export type EvalOutcomeCapability = { + id: string + name: string + description: string + /** + * Optional roleId for capabilities that map directly to a role page. + * Non-role capabilities represent Roo Code Cloud behaviors (validation, PR packaging, etc.). + */ + roleId?: string +} + +export type EvalOutcomeProfile = { + title: string + description: string + capabilities: EvalOutcomeCapability[] + howItWorks: string[] +} + +export type EvalOutcome = { + id: EvalOutcomeId + name: string + description: string + icon: LucideIcon + /** + * Ordered list of roleIds to suggest as a "setup". + * Keep roleIds stable even if display names evolve. + */ + recommendedRoleIds: string[] + whyItWorks: string[] + /** + * Optional profile details used to render a more comprehensive “exoskeleton” + * for an outcome. Start with the most important outcomes and expand over time. 
+ */ + builderProfile?: EvalOutcomeProfile +} + +export const EVAL_OUTCOMES: EvalOutcome[] = [ + { + id: "prototype_to_pr", + name: "Prototype → PR", + description: "Build a working prototype on the production codebase, then turn it into a reviewable diff.", + icon: Sparkles, + recommendedRoleIds: ["senior", "reviewer"], + whyItWorks: [ + "Multi-file changes with a reviewer pass for coherence and edge cases.", + "Optimizes for shipping, not slides.", + ], + builderProfile: { + title: "Your Builder Profile", + description: + "A default set of capabilities for turning a working prototype into a reviewable PR—on the production codebase.", + capabilities: [ + { + id: "multi_file_builder", + name: "Multi-file Builder", + description: "Builds the prototype directly in your repo across the files it touches.", + roleId: "senior", + }, + { + id: "reviewer_guardrails", + name: "Reviewer & Guardrails", + description: "Reviews the diff for correctness, edge cases, and coherence before you merge.", + roleId: "reviewer", + }, + { + id: "environment_setup", + name: "Environment setup", + description: + "Bootstraps a working dev environment and runs the workflow without you fighting Git, installs, or tests.", + }, + { + id: "validation_loop", + name: "Validation loop", + description: "Runs tests/lint/typechecks and iterates until it’s clean (or flags what’s blocked).", + }, + { + id: "pr_ready_output", + name: "PR-ready output", + description: "Produces a focused diff plus a plain-English summary and review notes.", + }, + { + id: "straight_line_merge", + name: "Straight-line to merge", + description: + "No export/import step: the work is already on the production codebase, so merge is a straight line.", + }, + { + id: "scope_control", + name: "Scope control", + description: "Keeps diffs tight: smaller review surface, fewer surprises, and easier merges.", + }, + ], + howItWorks: [ + "Build a working prototype directly in the production codebase.", + "Convert the prototype into a 
tight diff (tests, cleanup, and safeguards).", + "Run a reviewer pass to catch edge cases and improve merge confidence.", + "Deliver a PR-ready result with context and next steps.", + ], + }, + }, + { + id: "paper_cuts", + name: "Paper cuts & small fixes", + description: "Fix the small stuff without dragging engineers off big projects.", + icon: CheckCircle2, + recommendedRoleIds: ["junior", "reviewer"], + whyItWorks: [ + "Small diffs are high-leverage when the work is well-scoped.", + "Reviewer keeps the quality bar and reduces surprise.", + ], + }, + { + id: "sentry_triage", + name: "Sentry triage", + description: "Turn recurring errors into concrete fixes with proof before review.", + icon: Bug, + recommendedRoleIds: ["autonomous", "reviewer"], + whyItWorks: [ + "Autonomous runs handle multi-step investigation and iteration.", + "Reviewer focuses on safety, correctness, and “does this hold up?”.", + ], + }, + { + id: "repro_to_fix", + name: "Bug repro → fix", + description: "Make the handoff less lossy: reproduce, patch, and validate in one loop.", + icon: Workflow, + recommendedRoleIds: ["senior", "reviewer"], + whyItWorks: [ + "Good default for ambiguous bugs that touch a few files.", + "Reviewer helps catch cross-team assumptions early.", + ], + }, + { + id: "review_guardrails", + name: "Guardrails & review", + description: "Raise the quality bar without becoming the blocker.", + icon: GitPullRequest, + recommendedRoleIds: ["reviewer"], + whyItWorks: [ + "Works alongside CI, linters, and team review.", + "Scales judgement through fast, consistent feedback.", + ], + }, + { + id: "issue_to_pr", + name: "Issue → PR", + description: "Run end-to-end work in the background and come back to a reviewable result.", + icon: GitPullRequest, + recommendedRoleIds: ["autonomous", "reviewer"], + whyItWorks: [ + "Handles out-of-band work while humans stay on the roadmap.", + "Pairs autonomy with guardrails for merge safety.", + ], + }, +] + +export function 
isEvalOutcomeId(value: string): value is EvalOutcomeId { + return EVAL_OUTCOMES.some((o) => o.id === value) +} diff --git a/apps/web-roo-code/src/lib/mock-recommendations.ts b/apps/web-roo-code/src/lib/mock-recommendations.ts index 99cc7f9b800..9a08487ca7d 100644 --- a/apps/web-roo-code/src/lib/mock-recommendations.ts +++ b/apps/web-roo-code/src/lib/mock-recommendations.ts @@ -1,7 +1,7 @@ // --------------------------------------------------------------------------- // Eval Recommendations: Types + Mock Data (S1.1a) // --------------------------------------------------------------------------- -// This file defines the API contract for the AI Engineer Talent Marketplace. +// This file defines the API contract for the /evals/workers recommendation pages. // The backend (Sprint 3-4) will produce data matching these exact types. // --------------------------------------------------------------------------- @@ -16,11 +16,11 @@ export const TASKS_PER_DAY = 80 // ── Types ────────────────────────────────────────────────────────────────── -/** Engineer role definition: maps task complexity to a hiring tier. */ +/** Engineer role definition: maps task complexity to a recommendation tier. */ export type EngineerRole = { id: string name: string - /** Daily salary range string, e.g. "$3–38/day" */ + /** Short descriptor shown under the profile name (scope, mode, etc.). 
*/ salaryRange: string description: string bestFor: string[] @@ -80,51 +80,51 @@ export type RoleRecommendation = { const ENGINEER_ROLES: EngineerRole[] = [ { id: "junior", - name: "Junior Engineer", - salaryRange: "$2–10/day", + name: "Single-file Builder", + salaryRange: "Scope: single-file", description: - "Handles well-scoped, single-file tasks: boilerplate, simple bug fixes, and test generation at the lowest cost per task.", - bestFor: ["Single-file fixes", "Boilerplate generation", "Test generation", "Simple implementations"], - strengths: ["Cheap", "High throughput", "Best cost-to-quality ratio on simple tasks"], - weaknesses: ["Struggles with multi-file changes", "Limited reasoning depth", "May miss edge cases"], + "Best for tight diffs: boilerplate, small fixes, and test updates. Great when the work is clear and bounded.", + bestFor: ["Small fixes", "Boilerplate", "Test updates", "Simple implementations"], + strengths: ["Fast iteration", "Stays close to the requested change", "Great for well-scoped diffs"], + weaknesses: ["Not ideal for cross-cutting work", "Can miss edge cases in complex systems"], icon: "Code", }, { id: "senior", - name: "Senior Engineer", - salaryRange: "$10–26/day", + name: "Multi-file Builder", + salaryRange: "Scope: multi-file", description: - "The sweet spot for most engineering work. 
Senior-tier models balance cost and quality across multi-file refactors, feature development, and debugging.", - bestFor: ["Multi-file refactors", "Feature development", "Debugging", "Code review"], + "For most day-to-day shipping: feature work across a few files, refactors, and debugging with solid consistency.", + bestFor: ["Feature work", "Multi-file refactors", "Debugging", "Integrations"], strengths: [ - "Balanced cost/quality", - "Handles multi-file changes and cross-cutting refactors", - "Consistent pass rates across all five languages", + "Reliable for common product work", + "Handles multi-file changes and dependencies", + "Consistent across all five languages", ], - weaknesses: ["More expensive than junior", "Overkill for trivial tasks"], + weaknesses: ["Overkill for trivial diffs", "May need help on cross-cutting architecture"], icon: "GitBranch", }, { id: "staff", - name: "Staff Engineer", - salaryRange: "$8–34/day", + name: "Architecture & Refactor", + salaryRange: "Scope: cross-cutting", description: - "For architecture decisions, system design, and complex refactors. 
Staff-tier models handle ambiguous requirements and cross-cutting changes where other tiers fail.", - bestFor: ["Architecture decisions", "Complex features", "System design", "Ambiguous requirements"], + "For ambiguity and cross-cutting changes: architecture decisions, complex refactors, and work where correctness matters more than speed.", + bestFor: ["Complex refactors", "Architecture changes", "Ambiguous requirements", "System design"], strengths: [ - "Handles multi-step reasoning and ambiguous specs", - "Passes existing test suites consistently", - "Resolves underspecified requirements", + "Strong multi-step reasoning", + "Good at navigating bigger codebases", + "Better at making safe, coherent changes", ], - weaknesses: ["Most expensive", "Overkill for simple tasks", "Diminishing returns on easy work"], + weaknesses: ["Overkill for simple diffs", "Still needs human review before merge"], icon: "Building2", }, { id: "reviewer", - name: "Architecture Reviewer", - salaryRange: "$15–40/day", + name: "Reviewer & Guardrails", + salaryRange: "Mode: review", description: - "For code review, PR feedback, security analysis, and design critique. Reviewer-tier models catch issues other models miss and provide actionable, context-aware suggestions.", + "For PR feedback, security review, and design critique. 
Use this to improve quality and reduce surprises before merge.", bestFor: ["Code review", "PR feedback", "Security analysis", "Design critique", "Refactor guidance"], strengths: [ "Catches subtle bugs and logic errors", @@ -132,18 +132,18 @@ const ENGINEER_ROLES: EngineerRole[] = [ "Understands cross-file impact of changes", ], weaknesses: [ - "Not for writing code from scratch", - "More expensive than running linters", + "Not for writing features end-to-end", + "Not a replacement for CI and linters", "Review quality varies by codebase size", ], icon: "Search", }, { id: "autonomous", - name: "Autonomous Agent", - salaryRange: "$5–30/day", + name: "Autonomous Delivery", + salaryRange: "Mode: end-to-end", description: - "For issue-to-PR workflows, long-running tasks, and multi-step debugging with minimal supervision. Autonomous-tier models complete tasks end-to-end and recover from errors without human intervention.", + "For issue-to-PR workflows and long-running tasks. Best when you want an agent to run, iterate, and bring back a reviewable result.", bestFor: [ "Issue-to-PR workflows", "Multi-step debugging", From d91c035e9996a2f5e0a45e02712623ad7382f838 Mon Sep 17 00:00:00 2001 From: Michael Preuss Date: Thu, 12 Feb 2026 19:20:02 -0800 Subject: [PATCH 11/22] Make workers outcomes-first canonical; redirect v2; refresh methodology --- .../evals/methodology/methodology-content.tsx | 1254 ++++++----------- .../src/app/evals/methodology/page.tsx | 7 +- .../workers-v2/[roleId]/compare/page.tsx | 85 +- .../app/evals/workers-v2/[roleId]/page.tsx | 102 +- .../app/evals/workers-v2/_redirect-utils.ts | 11 + .../src/app/evals/workers-v2/page.tsx | 93 +- .../workers/[roleId]/candidates-content.tsx | 8 - .../[roleId]/compare/comparison-chart.tsx | 8 - .../workers/[roleId]/copy-settings-button.tsx | 2 +- .../src/app/evals/workers/page.tsx | 41 +- .../src/app/evals/workers/workers-content.tsx | 1168 ++++++++------- apps/web-roo-code/src/lib/eval-outcomes.ts | 342 ++++- 
.../src/lib/mock-recommendations.ts | 16 +- .../src/lib/objective-default-models-v1.ts | 227 +++ 14 files changed, 1593 insertions(+), 1771 deletions(-) create mode 100644 apps/web-roo-code/src/app/evals/workers-v2/_redirect-utils.ts create mode 100644 apps/web-roo-code/src/lib/objective-default-models-v1.ts diff --git a/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx b/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx index 654b1740feb..852356c6c80 100644 --- a/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx +++ b/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx @@ -1,931 +1,517 @@ "use client" import { motion } from "framer-motion" -import { - ArrowRight, - FlaskConical, - Code, - AlertTriangle, - BarChart3, - Terminal, - ExternalLink, - CheckCircle2, - Beaker, - Timer, - DollarSign, - Zap, - Trophy, - Scale, -} from "lucide-react" +import { AlertTriangle, CheckCircle2, Scale, Timer, Zap } from "lucide-react" import Link from "next/link" -// ── Framer Motion Variants ────────────────────────────────────────────────── - const containerVariants = { hidden: { opacity: 0 }, visible: { opacity: 1, transition: { staggerChildren: 0.12, - delayChildren: 0.1, + delayChildren: 0.08, }, }, } const fadeUpVariants = { - hidden: { opacity: 0, y: 20 }, + hidden: { opacity: 0, y: 18 }, visible: { opacity: 1, y: 0, - transition: { - duration: 0.6, - ease: [0.21, 0.45, 0.27, 0.9] as const, - }, + transition: { duration: 0.55, ease: [0.21, 0.45, 0.27, 0.9] as const }, }, } const backgroundVariants = { hidden: { opacity: 0 }, - visible: { - opacity: 1, - transition: { - duration: 1.2, - ease: "easeOut" as const, - }, - }, + visible: { opacity: 1, transition: { duration: 1.1, ease: "easeOut" as const } }, } -const cardVariants = { - hidden: { opacity: 0, y: 30 }, - visible: { - opacity: 1, - y: 0, - transition: { - duration: 0.6, - ease: [0.21, 0.45, 0.27, 0.9] as const, - }, - }, +function InlineArrow() { + 
return ( + + → + + ) } -// ── Section Number Marker ─────────────────────────────────────────────────── - -function SectionNumber({ num }: { num: string }) { +function Chip({ icon: Icon, label }: { icon: React.ComponentType<{ className?: string }>; label: string }) { return ( - - {num} + + + {label} ) } -// ── Process Step Icon ─────────────────────────────────────────────────────── - -function ProcessStep({ +function Callout({ icon: Icon, - label, - isLast, + title, + body, + tone = "neutral", }: { icon: React.ComponentType<{ className?: string }> - label: string - isLast?: boolean + title: string + body: string + tone?: "neutral" | "warning" | "success" }) { + const toneClasses = + tone === "warning" + ? "border-amber-500/20 bg-amber-500/5" + : tone === "success" + ? "border-emerald-500/20 bg-emerald-500/5" + : "border-border/50 bg-card/40" + return ( -
-
-
+
+
+
- {label} -
- {!isLast && ( -
-
- +
+

{title}

+

{body}

- )} +
) } -// ── Language Card ─────────────────────────────────────────────────────────── - -function LanguageCard({ name, color }: { name: string; color: string }) { +function Step({ num, title, body }: { num: string; title: string; body: string }) { return ( -
-
- {name.slice(0, 2).toUpperCase()} +
+ + {num} + +
+

{title}

+

{body}

- {name}
) } -// ── Scoring Bar Component ─────────────────────────────────────────────────── - -function ScoringBar({ - label, - icon: Icon, - color, - bgColor, - weight, - description, -}: { - label: string - icon: React.ComponentType<{ className?: string }> - color: string - bgColor: string - weight: number - description: string -}) { +function SmallLink({ href, label }: { href: string; label: string }) { return ( -
-
- -
-
-
-

{label}

- {weight}% -
-

{description}

-
- -
-
-
+ + {label} + ) } -// ── Main Content Component ────────────────────────────────────────────────── - export function MethodologyContent() { return ( <> - {/* ════════════════════════════════════════════════════════════════ - HERO SECTION - ════════════════════════════════════════════════════════════════ */} -
- {/* Atmospheric blur background */} + {/* Hero */} +
+ variants={backgroundVariants} + className="absolute inset-0"> +
-
-
-
+
+
+
+
-
+
- {/* Breadcrumb */} - - - Evals - - / - - Build with Roo Code Cloud - - / - Methodology - - - {/* Heading */} - - How We Run{" "} - - Evals - - - - {/* Subtitle */} - - Same exercises, same environment, same scoring for every model. Every step is documented and - every eval run is reproducible. - - - {/* Pill badge links */} - - - - View recommendations - - - - - Raw eval data - - - - -
-
- - {/* ════════════════════════════════════════════════════════════════ - SECTION 01: THE INTERVIEW PROCESS - ════════════════════════════════════════════════════════════════ */} - -
- - - - - - The Eval Process - - - -

- We don't test models in isolation. We test them as they work inside Roo Code. Each - model gets the same exercises, same time limit, same tools. We measure what matters. -

-
- - {/* Process flow */} - - - - - - - - {/* Key principles */} - - {[ - { - title: "Identical Environment", - desc: "Docker container with VS Code, Roo Code extension, and a fresh workspace per exercise.", - }, - { - title: "No Cherry-Picking", - desc: "Every model gets the exact same eval run. No curated demos, no special treatment.", - }, - { - title: "Real Metrics", - desc: "Does it pass the tests? How much does it cost? How fast does it deliver?", - }, - ].map((item) => ( - -

{item.title}

-

{item.desc}

-
- ))} -
-
-
- - {/* ════════════════════════════════════════════════════════════════ - SECTION 02: THE EVAL SUITE - ════════════════════════════════════════════════════════════════ */} - - {/* Subtle background glow */} - -
-
-
- - -
- - - - - - The Eval Suite - - - - Hundreds of coding exercises across 5 languages and{" "} - 3 difficulty tiers. From single-file fixes to - complex architecture decisions. - - - {/* Language cards */} - - - - - - - - - - - - - - - - - - - {/* Difficulty tiers */} - -

- Difficulty Tiers -

-
- {/* Easy */} -
-
- E -
-
-

Easy

-

- Single-file fixes, straightforward implementations, basic debugging -

-
-
- - 90–95% - -
- -
-
-
- {/* Medium */} -
-
- M -
-
-

Medium

-

- Multi-file changes, refactoring, cross-file understanding -

-
-
- - 60–80% - -
- -
-
+ variants={containerVariants} + className="mx-auto max-w-6xl"> +
+
+ + + Evals + + / + + Recommendations + + / + Methodology + + + + Read this before you compare models + + + + How we run evals + + + + We keep tasks, environment, and scoring constant across models. Use our results to + pick a default for a specific objective, then validate in your repo. + + + + + + + + + + + View recommendations + + + Raw eval data + + + + + Jump to: + + What we hold constant + + + Scoring + + + Limitations + +
- {/* Hard */} -
-
- H -
-
-

Hard

-

- Architecture decisions, ambiguous requirements, complex system design -

-
-
- - 30–50% - -
- + + +
+
+ +
+

+ Methodology at a glance +

+

+ Comparable results, not universal truth +

+ +
+
+

+ We hold constant +

+

+ Same exercises, same tools, same time limit, same scoring. +

+
+
+

+ We measure +

+

+ Pass rate, latency, and cost signals across multiple languages. +

+
+
+

+ We recommend +

+

+ A default model and agent lineup for an objective. It’s a + baseline, not a guarantee. +

+
+
+ +
+ + Objective-first + + + Optimized for: Quality / Speed / Cost + + + Validate in your repo + +
-
+
- - - {/* ════════════════════════════════════════════════════════════════ - SECTION 03: ENGINEER ROLES - ════════════════════════════════════════════════════════════════ */} - -
- - - - - - Engineer Roles - +
+ {/* Body */} +
+
-

- Each role represents a different engineering seniority level. We test models against - exercises matched to that role's complexity, then score using role-specific weights. -

-
- - {/* How weights differ */} - - -

Different Roles, Different Weights

-

- Each role has its own scoring weights. A model that's great for simple tasks might - not rank for architecture decisions. -

-
- -

Matched Exercises

-

- Budget roles get simpler exercises. Complex roles get harder ones. The difficulty and - scoring shift together so recommendations stay relevant. -

-
-
+ initial="hidden" + whileInView="visible" + viewport={{ once: true, margin: "-120px" }} + variants={containerVariants} + className="mx-auto grid max-w-6xl grid-cols-1 gap-10 lg:grid-cols-12"> + {/* Left rail */} + +
+

+ How to read results responsibly +

+

+ Evals help you pick a better default. They don’t predict how a model behaves + in your repo, with your tests, tooling, and constraints. +

- {/* Budget vs Complex comparison */} - -

- How Scoring Weights Shift -

-
- {/* Budget roles */} -
-
- +
+ +
-
-

Budget Roles

-

- Cost and speed matter most. Simpler exercises where many models succeed, so - efficiency breaks the tie. + +

-
-
-
-
- - {/* Complex roles */} -
-
- -
-
-

Complex Roles

-

- Reasoning quality and success rate matter most. Harder exercises where only the - best models deliver. + + +

+

+ Quick definitions

-
-
-
-
-
-
-
-
-
- - Success ↑ - - - Quality ↑ - - Cost - Speed -
+
    +
  • + + + Objective: + the workflow you want to ship (for example,{" "} + Issue → PR). + +
  • +
  • + + + + Optimized for + + : the tradeoff you care about most ( + Quality,{" "} + Speed,{" "} + Cost). + +
  • +
  • + + + Pass rate: + percent of exercises a model completes within the limit. + +
  • +
-
- - - {/* Link to roles page */} - - - Browse all engineer roles - - - -
- - - {/* ════════════════════════════════════════════════════════════════ - SECTION 04: SCORING - ════════════════════════════════════════════════════════════════ */} - - {/* Background glow */} - -
-
-
- - -
- - - - - - Scoring - - - - Each model receives a composite score, a weighted - sum of four dimensions normalized to a 0–100 scale. - - - {/* Scoring formula components */} - - - - - - - - - - - - - - - - {/* Tier classification */} - -

- Recommendation Tiers -

-

- Composite scores are mapped to recommendation tiers: -

-
- - - {/* Best */} - -
- ≥85 -
-
- - Best - -

Top Performer

-

Highly recommended for this role.

-
- {/* Recommended */} - -
- 70–84 -
-
- - Recommended - -

Solid Choice

-

- Reliable for most tasks at this level. + {/* Main column */} +

+ +

+ Methodology

-
- - - {/* Situational */} - -
- 50–69 -
-
- - Situational - -

Usable with Caveats

-

May struggle in specific areas.

-
-
+

+ How to use these evals +

+

+ The recommendations page is organized around what you’re trying to ship. You + pick the objective and tradeoff. We show the best default setup based on the signal + we have. +

+ - {/* Not Recommended */} - -
- <50 -
-
- - Not Recommended - -

High Failure Rate

-

Not suitable for this role.

+
+ + + + + + + + +
- - - - Per-language breakdowns reveal where each model excels or struggles. A model might score well - overall but underperform in Rust, or dominate in Python but lag in Go. - -
- - - {/* ════════════════════════════════════════════════════════════════ - SECTION 05: RUN YOUR OWN EVALS - ════════════════════════════════════════════════════════════════ */} - -
- - - - - - Run Your Own Evals - - - - Our evaluation framework is fully open source. Run the exact same evals on your own - infrastructure, with your own API keys, against any model. - - - {/* Terminal card */} - - {/* Terminal header */} -
-
-
-
-
-
-
- - terminal -
-
- {/* Terminal body */} -
-
- ${" "} - git clone{" "} - https://github.com/RooCodeInc/Roo-Code-Evals.git -
-
- $ cd{" "} - Roo-Code-Evals -
-
- ${" "} - # Follow the README for setup instructions -
-
- + +

+ Scoring and signals +

+

+ What we measure +

+
+ + + + +
+
- {/* GitHub link */} - - - - - - View on GitHub - - - -
- - - {/* ════════════════════════════════════════════════════════════════ - SECTION 06: LIMITATIONS - ════════════════════════════════════════════════════════════════ */} - -
- - - + +

+ Tradeoffs +

+

+ Quality, speed, cost: pick one to optimize +

+
+

+ Choosing an optimization mode is how you tell the system what matters most for + your objective. If you care about merge confidence, optimize for Quality. If you + care about throughput, Speed and Cost matter. +

+

+ When two models are close on pass rate, the most practical tie-breakers are + latency and $/task. +

+
+
- - Limitations - - - - Every evaluation has blind spots. These are ours. - - - - {[ - { - title: "Single test environment", - description: - "All evals run in Docker + VS Code. Results may differ in other IDEs or environments.", - }, - { - title: "Expanding exercise coverage", - description: - "Hundreds of exercises, but the suite is continuously growing. Some niche patterns may be underrepresented.", - }, - { - title: "API changes affect results", - description: - "Providers update their models. A model that scored well last month may behave differently after an update.", - }, - { - title: "Point-in-time snapshots", - description: - 'Each eval run captures performance at a specific point. We re-run regularly; check the "last updated" date.', - }, - ].map((item) => ( - - -
-

{item.title}

-

- {item.description} + +

+ Limitations +

+

+ What these evals don’t tell you +

+
+

+ A model can score well on this suite and still struggle in your repo because + your stack, tests, dependencies, and CI constraints are different. +

+

+ The right move is to treat our results as a starting point, then run your + objective end-to-end in Roo Code Cloud and inspect the PR output.

- - ))} - -
- + - {/* ════════════════════════════════════════════════════════════════ - BOTTOM NAVIGATION - ════════════════════════════════════════════════════════════════ */} -
-
- - - Ready to see the results? - - - - - View recommendations - - - - - Raw eval data - - - + +

+ Links +

+
+ + +
+
+

+ If something feels off in the recommendations, that’s a signal too. The + fastest path is to run your objective in Roo Code Cloud and compare the PR + output. +

+
+
+
+ diff --git a/apps/web-roo-code/src/app/evals/methodology/page.tsx b/apps/web-roo-code/src/app/evals/methodology/page.tsx index 039d7e6fd75..e723cabd674 100644 --- a/apps/web-roo-code/src/app/evals/methodology/page.tsx +++ b/apps/web-roo-code/src/app/evals/methodology/page.tsx @@ -7,9 +7,10 @@ import { MethodologyContent } from "./methodology-content" // ── SEO Metadata ──────────────────────────────────────────────────────────── -const TITLE = "Methodology | Roo Code Evals" -const DESCRIPTION = "Our methodology for evaluating AI coding models. Transparent, reproducible, evidence-based." -const OG_DESCRIPTION = "Our methodology for evaluating AI coding models" +const TITLE = "Methodology | Roo Code Cloud Evals" +const DESCRIPTION = + "How we run Roo Code Cloud evals and how to interpret outcomes-first recommendations. Same tasks, same limits, clear tradeoffs." +const OG_DESCRIPTION = "How we run Roo Code Cloud evals" const PATH = "/evals/methodology" export const metadata: Metadata = { diff --git a/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/compare/page.tsx b/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/compare/page.tsx index 2176eaae88b..ceb688d8a7c 100644 --- a/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/compare/page.tsx +++ b/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/compare/page.tsx @@ -1,83 +1,14 @@ -import { notFound } from "next/navigation" -import type { Metadata } from "next" +import { permanentRedirect } from "next/navigation" -import { SEO } from "@/lib/seo" -import { ogImageUrl } from "@/lib/og" -import { getEngineerRole, getRoleRecommendation } from "@/lib/mock-recommendations" +import { buildQueryString, type RedirectSearchParams } from "../../_redirect-utils" -import { ComparisonChart } from "../../../workers/[roleId]/compare/comparison-chart" - -type PageProps = { params: Promise<{ roleId: string }> } - -export async function generateMetadata({ params }: PageProps): Promise { - const { roleId } = await params - 
const role = getEngineerRole(roleId) - - if (!role) { - return { - title: "Role Not Found | Roo Code Evals", - description: "The requested role was not found.", - } - } - - const title = `Compare Models — ${role.name} (V2 Preview) | Roo Code Evals` - const description = `Outcome-first comparison of AI models for ${role.name}. Compare composite score, success rate, cost efficiency, and speed.` - const ogDescription = `Compare Models — ${role.name} (V2 Preview)` - const path = `/evals/workers-v2/${roleId}/compare` - - return { - title, - description, - alternates: { - canonical: `${SEO.url}${path}`, - }, - openGraph: { - title, - description, - url: `${SEO.url}${path}`, - siteName: SEO.name, - images: [ - { - url: ogImageUrl(title, ogDescription), - width: 1200, - height: 630, - alt: title, - }, - ], - locale: SEO.locale, - type: "website", - }, - twitter: { - card: SEO.twitterCard, - title, - description, - images: [ogImageUrl(title, ogDescription)], - }, - keywords: [ - ...SEO.keywords, - "AI coding", - "model comparison", - "coding evals", - role.name.toLowerCase(), - "outcome-first", - ], - } +type PageProps = { + params: Promise<{ roleId: string }> + searchParams?: Promise<RedirectSearchParams> +} -export default async function WorkersV2ComparePage({ params }: PageProps) { +export default async function WorkersV2ComparePage({ params, searchParams }: PageProps) { const { roleId } = await params - const recommendation = getRoleRecommendation(roleId) - - if (!recommendation) { - notFound() - } - - return ( - - ) + const sp = (await searchParams) ?? 
{} + permanentRedirect(`/evals/workers/${roleId}/compare${buildQueryString(sp)}`) } diff --git a/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/page.tsx b/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/page.tsx index 8afef01a589..affc2355154 100644 --- a/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/page.tsx +++ b/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/page.tsx @@ -1,100 +1,14 @@ -import { notFound } from "next/navigation" -import type { Metadata } from "next" +import { permanentRedirect } from "next/navigation" -import { SEO } from "@/lib/seo" -import { ogImageUrl } from "@/lib/og" -import { getRoleRecommendation, getCloudSetupUrl } from "@/lib/mock-recommendations" +import { buildQueryString, type RedirectSearchParams } from "../_redirect-utils" -import { CandidatesContent } from "../../workers/[roleId]/candidates-content" - -type PageProps = { params: Promise<{ roleId: string }> } - -export async function generateMetadata({ params }: PageProps): Promise { - const { roleId } = await params - const recommendation = getRoleRecommendation(roleId) - - if (!recommendation) { - return { - title: "Role Not Found | Roo Code Evals", - description: "The requested role was not found.", - } - } - - const { role } = recommendation - const title = `${role.name} — Recommended Models (V2 Preview) | Roo Code Evals` - const description = `Outcome-first recommendations for ${role.name}. 
Compare models by success rate, cost, and speed across 5 languages.` - const ogDescription = `${role.name} — Recommended Models (V2 Preview)` - const path = `/evals/workers-v2/${roleId}` - - return { - title, - description, - alternates: { - canonical: `${SEO.url}${path}`, - }, - openGraph: { - title, - description, - url: `${SEO.url}${path}`, - siteName: SEO.name, - images: [ - { - url: ogImageUrl(title, ogDescription), - width: 1200, - height: 630, - alt: title, - }, - ], - locale: SEO.locale, - type: "website", - }, - twitter: { - card: SEO.twitterCard, - title, - description, - images: [ogImageUrl(title, ogDescription)], - }, - keywords: [ - ...SEO.keywords, - "AI coding", - "coding agents", - "model recommendations", - "coding evals", - role.name.toLowerCase(), - "outcome-first", - ], - } +type PageProps = { + params: Promise<{ roleId: string }> + searchParams?: Promise } -export default async function WorkersV2RolePage({ params }: PageProps) { +export default async function WorkersV2RolePage({ params, searchParams }: PageProps) { const { roleId } = await params - const recommendation = getRoleRecommendation(roleId) - - if (!recommendation) { - notFound() - } - - const { role, best, budgetHire, speedHire, allCandidates, totalEvalRuns, totalExercises, lastUpdated } = - recommendation - - const cloudUrls: Record = {} - for (const candidate of allCandidates) { - cloudUrls[candidate.modelId] = getCloudSetupUrl(candidate) - } - - return ( - - ) + const sp = (await searchParams) ?? 
{} + permanentRedirect(`/evals/workers/${roleId}${buildQueryString(sp)}`) } diff --git a/apps/web-roo-code/src/app/evals/workers-v2/_redirect-utils.ts b/apps/web-roo-code/src/app/evals/workers-v2/_redirect-utils.ts new file mode 100644 index 00000000000..a8884e591a6 --- /dev/null +++ b/apps/web-roo-code/src/app/evals/workers-v2/_redirect-utils.ts @@ -0,0 +1,11 @@ +export type RedirectSearchParams = Record + +export function buildQueryString(searchParams: RedirectSearchParams): string { + const params = new URLSearchParams() + for (const [key, value] of Object.entries(searchParams)) { + if (typeof value === "string") params.set(key, value) + else if (Array.isArray(value)) value.forEach((v) => params.append(key, v)) + } + const qs = params.toString() + return qs ? `?${qs}` : "" +} diff --git a/apps/web-roo-code/src/app/evals/workers-v2/page.tsx b/apps/web-roo-code/src/app/evals/workers-v2/page.tsx index 5196214f680..e8559e09ca9 100644 --- a/apps/web-roo-code/src/app/evals/workers-v2/page.tsx +++ b/apps/web-roo-code/src/app/evals/workers-v2/page.tsx @@ -1,91 +1,12 @@ -import type { Metadata } from "next" -import { Fraunces, IBM_Plex_Sans } from "next/font/google" +import { permanentRedirect } from "next/navigation" -import { SEO } from "@/lib/seo" -import { ogImageUrl } from "@/lib/og" -import { getEngineerRoles, getAllRecommendations } from "@/lib/mock-recommendations" +import { buildQueryString, type RedirectSearchParams } from "./_redirect-utils" -import { WorkersContent } from "../workers/workers-content" - -const TITLE = "Build with Roo Code Cloud (V2 Preview) | Roo Code Evals" -const DESCRIPTION = - "Outcome-first, eval-backed recommendations for shipping production code. Start from what you need to ship and pick a setup." 
-const OG_DESCRIPTION = "Outcome-first recommendations for shipping production code" -const PATH = "/evals/workers-v2" - -const display = Fraunces({ subsets: ["latin"], variable: "--font-display" }) -const body = IBM_Plex_Sans({ subsets: ["latin"], weight: ["400", "500", "600"], variable: "--font-body" }) - -export const metadata: Metadata = { - title: TITLE, - description: DESCRIPTION, - alternates: { - canonical: `${SEO.url}${PATH}`, - }, - openGraph: { - title: TITLE, - description: DESCRIPTION, - url: `${SEO.url}${PATH}`, - siteName: SEO.name, - images: [ - { - url: ogImageUrl(TITLE, OG_DESCRIPTION), - width: 1200, - height: 630, - alt: TITLE, - }, - ], - locale: SEO.locale, - type: "website", - }, - twitter: { - card: SEO.twitterCard, - title: TITLE, - description: DESCRIPTION, - images: [ogImageUrl(TITLE, OG_DESCRIPTION)], - }, - keywords: [ - ...SEO.keywords, - "AI coding", - "coding agents", - "roo code cloud", - "model recommendations", - "coding evals", - "shipping code", - "prototype", - "outcome-first", - ], +type PageProps = { + searchParams?: Promise } -export default function WorkersV2Page() { - const roles = getEngineerRoles() - const recommendations = getAllRecommendations() - - const totalEvalRuns = recommendations.reduce((sum, recommendation) => sum + recommendation.totalEvalRuns, 0) - const totalExercises = recommendations.reduce((sum, recommendation) => sum + recommendation.totalExercises, 0) - const uniqueModels = new Set( - recommendations.flatMap((recommendation) => recommendation.allCandidates.map((candidate) => candidate.modelId)), - ) - const totalModels = uniqueModels.size - const lastUpdated = recommendations - .map((recommendation) => recommendation.lastUpdated) - .sort() - .pop() - - return ( -
- -
- ) +export default async function WorkersV2Page({ searchParams }: PageProps) { + const sp = (await searchParams) ?? {} + permanentRedirect(`/evals/workers${buildQueryString(sp)}`) } diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx index 9d48a2b0955..0b1105650f1 100644 --- a/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx +++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx @@ -665,8 +665,6 @@ export function CandidatesContent({ const searchParams = useSearchParams() const theme = ROLE_THEMES[roleId] ?? DEFAULT_THEME const IconComponent = ICON_MAP[role.icon] ?? Code - const alternateWorkersRootPath = workersRootPath === "/evals/workers-v2" ? "/evals/workers" : "/evals/workers-v2" - const alternateVersionLabel = workersRootPath === "/evals/workers-v2" ? "View baseline" : "View V2 preview" const setupQuery = (() => { const outcome = searchParams.get("outcome") if (!outcome) return "" @@ -773,12 +771,6 @@ export function CandidatesContent({ Methodology -
- - {alternateVersionLabel} - {/* Strengths + Trade-offs grid */} diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx index d03f83fe67b..8c005cdbd5f 100644 --- a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx +++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx @@ -446,8 +446,6 @@ export function ComparisonChart({ const searchParams = useSearchParams() const { allCandidates } = recommendation const theme = ROLE_THEMES[roleId] ?? DEFAULT_THEME - const alternateWorkersRootPath = workersRootPath === "/evals/workers-v2" ? "/evals/workers" : "/evals/workers-v2" - const alternateVersionLabel = workersRootPath === "/evals/workers-v2" ? "View baseline" : "View V2 preview" const setupQuery = (() => { const outcome = searchParams.get("outcome") if (!outcome) return "" @@ -593,12 +591,6 @@ export function ComparisonChart({ / Compare Models - / - - {alternateVersionLabel} - {/* Title row */} diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/copy-settings-button.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/copy-settings-button.tsx index 06adacff5a2..b330ba76536 100644 --- a/apps/web-roo-code/src/app/evals/workers/[roleId]/copy-settings-button.tsx +++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/copy-settings-button.tsx @@ -34,7 +34,7 @@ export function CopySettingsButton({ settings }: CopySettingsButtonProps) { ) : ( <> - 🔧 Configure Extension + Copy Roo Code Cloud Config )} diff --git a/apps/web-roo-code/src/app/evals/workers/page.tsx b/apps/web-roo-code/src/app/evals/workers/page.tsx index a2b95a1bd88..718011b56dc 100644 --- a/apps/web-roo-code/src/app/evals/workers/page.tsx +++ b/apps/web-roo-code/src/app/evals/workers/page.tsx @@ -1,4 +1,5 @@ import type { Metadata } from "next" +import { Fraunces, IBM_Plex_Sans } from "next/font/google" import { SEO } from "@/lib/seo" 
import { ogImageUrl } from "@/lib/og" @@ -10,10 +11,13 @@ import { WorkersContent } from "./workers-content" const TITLE = "Build with Roo Code Cloud | Roo Code Evals" const DESCRIPTION = - "Eval-backed model recommendations for shipping production code. Pick a setup based on the work you're doing: single-file fixes, multi-file changes, review, and autonomous runs." -const OG_DESCRIPTION = "Eval-backed model recommendations for shipping production code" + "Outcome-first, eval-backed recommendations for shipping production code. Start from your objective and pick a tradeoff." +const OG_DESCRIPTION = "Outcome-first recommendations for shipping production code" const PATH = "/evals/workers" +const display = Fraunces({ subsets: ["latin"], variable: "--font-display" }) +const body = IBM_Plex_Sans({ subsets: ["latin"], weight: ["400", "500", "600"], variable: "--font-body" }) + export const metadata: Metadata = { title: TITLE, description: DESCRIPTION, @@ -62,11 +66,11 @@ export default function WorkersPage() { const recommendations = getAllRecommendations() // Aggregate totals - const totalEvalRuns = recommendations.reduce((sum, r) => sum + r.totalEvalRuns, 0) - const totalExercises = recommendations.reduce((sum, r) => sum + r.totalExercises, 0) - - // Unique model count across all roles - const uniqueModels = new Set(recommendations.flatMap((r) => r.allCandidates.map((c) => c.modelId))) + const totalEvalRuns = recommendations.reduce((sum, recommendation) => sum + recommendation.totalEvalRuns, 0) + const totalExercises = recommendations.reduce((sum, recommendation) => sum + recommendation.totalExercises, 0) + const uniqueModels = new Set( + recommendations.flatMap((recommendation) => recommendation.allCandidates.map((candidate) => candidate.modelId)), + ) const totalModels = uniqueModels.size const lastUpdated = recommendations @@ -75,17 +79,16 @@ export default function WorkersPage() { .pop() return ( - +
+ +
) } diff --git a/apps/web-roo-code/src/app/evals/workers/workers-content.tsx b/apps/web-roo-code/src/app/evals/workers/workers-content.tsx index e5c770ea6da..9189190120d 100644 --- a/apps/web-roo-code/src/app/evals/workers/workers-content.tsx +++ b/apps/web-roo-code/src/app/evals/workers/workers-content.tsx @@ -2,23 +2,7 @@ import { useCallback, useMemo } from "react" import { motion } from "framer-motion" -import { - Code, - GitBranch, - Building2, - Search, - Bot, - ArrowRight, - ChevronDown, - CheckCircle2, - AlertTriangle, - Users, - FlaskConical, - Beaker, - Globe, - TrendingUp, -} from "lucide-react" -import type { LucideIcon } from "lucide-react" +import { ArrowRight, FlaskConical, Beaker } from "lucide-react" import Link from "next/link" import { usePathname, useRouter, useSearchParams } from "next/navigation" import { ScatterChart, Scatter, XAxis, YAxis, ZAxis, Tooltip, ResponsiveContainer, Cell, ReferenceLine } from "recharts" @@ -26,120 +10,7 @@ import { ScatterChart, Scatter, XAxis, YAxis, ZAxis, Tooltip, ResponsiveContaine import type { EngineerRole, RoleRecommendation } from "@/lib/mock-recommendations" import { TASKS_PER_DAY, MODEL_TIMELINE } from "@/lib/mock-recommendations" import { EVAL_OUTCOMES, isEvalOutcomeId, type EvalOutcomeId } from "@/lib/eval-outcomes" - -// ── Icon Mapping ──────────────────────────────────────────────────────────── - -const ICON_MAP: Record = { - Code, - GitBranch, - Building2, - Search, - Bot, -} - -// ── Color Themes per Role ─────────────────────────────────────────────────── - -type RoleTheme = { - accent: string - accentLight: string - accentDark: string - iconBg: string - iconText: string - badgeBg: string - badgeText: string - borderHover: string - shadowHover: string - buttonBg: string - buttonHover: string - glowColor: string - dotColor: string - strengthColor: string -} - -const ROLE_THEMES: Record = { - junior: { - accent: "emerald", - accentLight: "text-emerald-600", - accentDark: "dark:text-emerald-400", - 
iconBg: "bg-emerald-100 dark:bg-emerald-900/30", - iconText: "text-emerald-700 dark:text-emerald-300", - badgeBg: "bg-emerald-100 dark:bg-emerald-900/30", - badgeText: "text-emerald-700 dark:text-emerald-300", - borderHover: "hover:border-emerald-500/40 dark:hover:border-emerald-400/30", - shadowHover: "hover:shadow-emerald-500/10 dark:hover:shadow-emerald-400/10", - buttonBg: "bg-emerald-600 dark:bg-emerald-600", - buttonHover: "hover:bg-emerald-700 dark:hover:bg-emerald-500", - glowColor: "bg-emerald-500/8 dark:bg-emerald-600/15", - dotColor: "bg-emerald-500", - strengthColor: "text-emerald-600 dark:text-emerald-400", - }, - senior: { - accent: "blue", - accentLight: "text-blue-600", - accentDark: "dark:text-blue-400", - iconBg: "bg-blue-100 dark:bg-blue-900/30", - iconText: "text-blue-700 dark:text-blue-300", - badgeBg: "bg-blue-100 dark:bg-blue-900/30", - badgeText: "text-blue-700 dark:text-blue-300", - borderHover: "hover:border-blue-500/40 dark:hover:border-blue-400/30", - shadowHover: "hover:shadow-blue-500/10 dark:hover:shadow-blue-400/10", - buttonBg: "bg-blue-600 dark:bg-blue-600", - buttonHover: "hover:bg-blue-700 dark:hover:bg-blue-500", - glowColor: "bg-blue-500/8 dark:bg-blue-600/15", - dotColor: "bg-blue-500", - strengthColor: "text-blue-600 dark:text-blue-400", - }, - staff: { - accent: "amber", - accentLight: "text-amber-600", - accentDark: "dark:text-amber-400", - iconBg: "bg-amber-100 dark:bg-amber-900/30", - iconText: "text-amber-700 dark:text-amber-300", - badgeBg: "bg-amber-100 dark:bg-amber-900/30", - badgeText: "text-amber-700 dark:text-amber-300", - borderHover: "hover:border-amber-500/40 dark:hover:border-amber-400/30", - shadowHover: "hover:shadow-amber-500/10 dark:hover:shadow-amber-400/10", - buttonBg: "bg-amber-600 dark:bg-amber-600", - buttonHover: "hover:bg-amber-700 dark:hover:bg-amber-500", - glowColor: "bg-amber-500/8 dark:bg-amber-600/15", - dotColor: "bg-amber-500", - strengthColor: "text-amber-600 dark:text-amber-400", - }, - 
reviewer: { - accent: "violet", - accentLight: "text-violet-600", - accentDark: "dark:text-violet-400", - iconBg: "bg-violet-100 dark:bg-violet-900/30", - iconText: "text-violet-700 dark:text-violet-300", - badgeBg: "bg-violet-100 dark:bg-violet-900/30", - badgeText: "text-violet-700 dark:text-violet-300", - borderHover: "hover:border-violet-500/40 dark:hover:border-violet-400/30", - shadowHover: "hover:shadow-violet-500/10 dark:hover:shadow-violet-400/10", - buttonBg: "bg-violet-600 dark:bg-violet-600", - buttonHover: "hover:bg-violet-700 dark:hover:bg-violet-500", - glowColor: "bg-violet-500/8 dark:bg-violet-600/15", - dotColor: "bg-violet-500", - strengthColor: "text-violet-600 dark:text-violet-400", - }, - autonomous: { - accent: "cyan", - accentLight: "text-cyan-600", - accentDark: "dark:text-cyan-400", - iconBg: "bg-cyan-100 dark:bg-cyan-900/30", - iconText: "text-cyan-700 dark:text-cyan-300", - badgeBg: "bg-cyan-100 dark:bg-cyan-900/30", - badgeText: "text-cyan-700 dark:text-cyan-300", - borderHover: "hover:border-cyan-500/40 dark:hover:border-cyan-400/30", - shadowHover: "hover:shadow-cyan-500/10 dark:hover:shadow-cyan-400/10", - buttonBg: "bg-cyan-600 dark:bg-cyan-600", - buttonHover: "hover:bg-cyan-700 dark:hover:bg-cyan-500", - glowColor: "bg-cyan-500/8 dark:bg-cyan-600/15", - dotColor: "bg-cyan-500", - strengthColor: "text-cyan-600 dark:text-cyan-400", - }, -} - -const DEFAULT_THEME = ROLE_THEMES.senior! +import { pickObjectiveDefaultModelV1 } from "@/lib/objective-default-models-v1" // ── Outcome Layer: Optimization Modes ────────────────────────────────────── @@ -150,9 +21,9 @@ const OPTIMIZATION_MODES: Array<{ label: string description: string }> = [ - { id: "best", label: "Best", description: "Best overall quality across our eval suite." }, - { id: "fastest", label: "Fastest", description: "Lower latency per task when speed matters." }, - { id: "cost", label: "Most cost-effective", description: "Lower cost per task for high-volume work." 
}, + { id: "best", label: "Quality", description: "Maximize pass rate and overall quality across our eval suite." }, + { id: "fastest", label: "Speed", description: "Lower latency per task when speed matters." }, + { id: "cost", label: "Cost", description: "Lower cost per task for high-volume work." }, ] function isEvalOptimizationMode(value: string): value is EvalOptimizationMode { @@ -167,9 +38,20 @@ function getModeCandidate(rec: RoleRecommendation | undefined, mode: EvalOptimiz } function getModeLabel(mode: EvalOptimizationMode) { - if (mode === "fastest") return "Fastest" - if (mode === "cost") return "Most cost-effective" - return "Best" + if (mode === "fastest") return "Speed" + if (mode === "cost") return "Cost" + return "Quality" +} + +function formatModelIdForUi(modelId: string) { + if (modelId.startsWith("claude-opus-")) { + const rest = modelId.replace(/^claude-opus-/, "") + const parts = rest.split("-").filter(Boolean) + if (parts.length >= 2) return `Opus ${parts[0]}.${parts[1]}` + if (parts.length === 1) return `Opus ${parts[0]}` + } + if (modelId === "kimi-k2-0905") return "Kimi K2" + return modelId } // ── Framer Motion Variants ────────────────────────────────────────────────── @@ -298,16 +180,6 @@ function TimelineTooltip({ // ── Sub-Components ────────────────────────────────────────────────────────── -function StatPill({ icon: Icon, value, label }: { icon: LucideIcon; value: string; label: string }) { - return ( -
- - {value} - {label} -
- ) -} - // ── Main Content Component ────────────────────────────────────────────────── type WorkersContentProps = { @@ -318,23 +190,21 @@ type WorkersContentProps = { totalModels: number lastUpdated: string | undefined workersRootPath?: string - enableOutcomeLayer?: boolean - alternateVersionHref?: string - alternateVersionLabel?: string } +// Outcomes-first is canonical. Baseline/V1 is removed from the UI. +const ENABLE_OUTCOME_LAYER = true + export function WorkersContent({ roles, recommendations, totalEvalRuns, - totalExercises, - totalModels, + totalExercises: _totalExercises, + totalModels: _totalModels, lastUpdated, workersRootPath = "/evals/workers", - enableOutcomeLayer = false, - alternateVersionHref, - alternateVersionLabel, }: WorkersContentProps) { + const enableOutcomeLayer = ENABLE_OUTCOME_LAYER const router = useRouter() const pathname = usePathname() const searchParams = useSearchParams() @@ -374,6 +244,13 @@ export function WorkersContent({ [pathname, router, searchParams], ) + const scrollToOutcomes = useCallback(() => { + if (typeof document === "undefined") return + const el = document.getElementById("outcomes") + if (!el) return + el.scrollIntoView({ behavior: "smooth", block: "start" }) + }, []) + const recByRole = new Map(recommendations.map((r) => [r.roleId, r])) const roleById = useMemo(() => new Map(roles.map((r) => [r.id, r])), [roles]) @@ -383,19 +260,39 @@ export function WorkersContent({ }, [selectedOutcomeId]) const setupQuery = useMemo(() => { - if (!enableOutcomeLayer || !selectedOutcomeId) return "" + if (!selectedOutcomeId) return "" const params = new URLSearchParams() params.set("outcome", selectedOutcomeId) params.set("mode", selectedMode) const query = params.toString() return query ? 
`?${query}` : "" - }, [enableOutcomeLayer, selectedOutcomeId, selectedMode]) + }, [selectedOutcomeId, selectedMode]) + + const isProfileView = useMemo(() => { + return searchParams.get("view") === "profile" + }, [searchParams]) - const profileTitle = selectedOutcome?.builderProfile?.title ?? "Your Builder Profile" const profileDescription = selectedOutcome?.builderProfile?.description ?? "A default setup built from our eval signals. It’s a baseline, not a guarantee." const profileHowItWorks = selectedOutcome?.builderProfile?.howItWorks ?? selectedOutcome?.whyItWorks ?? [] + const objectiveDefaultModel = useMemo(() => { + if (!selectedOutcomeId) return null + return pickObjectiveDefaultModelV1(selectedOutcomeId, selectedMode) + }, [selectedOutcomeId, selectedMode]) + const objectiveDefaultModelLabel = useMemo(() => { + if (!objectiveDefaultModel?.modelId) return "—" + return formatModelIdForUi(objectiveDefaultModel.modelId) + }, [objectiveDefaultModel]) + const examplePrompt = selectedOutcome?.builderProfile?.examplePrompt ?? 
"" + const cloudSetupHref = useMemo(() => { + if (!selectedOutcomeId) return "/cloud-agents/setup" + const params = new URLSearchParams() + params.set("outcome", selectedOutcomeId) + params.set("mode", selectedMode) + if (examplePrompt) params.set("prompt", examplePrompt) + return `/cloud-agents/setup?${params.toString()}` + }, [examplePrompt, selectedMode, selectedOutcomeId]) const profileCapabilities = useMemo(() => { if (!selectedOutcome) return [] @@ -414,7 +311,7 @@ export function WorkersContent({ const agentCapabilities = useMemo(() => profileCapabilities.filter((c) => Boolean(c.roleId)), [profileCapabilities]) - const builtInCapabilities = useMemo(() => profileCapabilities.filter((c) => !c.roleId), [profileCapabilities]) + const skillCapabilities = useMemo(() => profileCapabilities.filter((c) => !c.roleId), [profileCapabilities]) // ── Timeline scatter data ────────────────────────────────────────────── const timelineData = useMemo(() => { @@ -460,255 +357,189 @@ export function WorkersContent({
- {/* Blueprint grid overlay (V2) */} - {enableOutcomeLayer ? ( -
- ) : null} + {/* Blueprint grid overlay */} +
{/* Gradient fade from hero atmosphere to cards */}
- {/* Badge */} - -
- - - How we run evals - - - {alternateVersionHref && alternateVersionLabel ? ( - - {alternateVersionLabel} - - ) : null} -
-
- {enableOutcomeLayer ? ( - - Outcomes over artifacts - - ) : null} - - {/* Heading */} - - {enableOutcomeLayer ? ( - <> - Build from outcomes. -
Ship{" "} - - real code - - . - - ) : ( - <> - Build with{" "} - - Roo Code Cloud - - - )} -
- - {/* Subheading */} - - {enableOutcomeLayer ? ( - <> - Pick what you're trying to ship. We assemble a Builder Profile: the - capabilities you need, plus a default model recommendation backed by eval data. - - ) : ( - <> - Outcomes over artifacts: start from the production codebase and ship as a reviewable - PR. Every model runs the same tasks, same tools, and the same time limit. Your repo - will differ—treat this as a baseline. - - )} - - - {enableOutcomeLayer ? ( - - - Start with Prototype → PR - - - - Browse outcomes - - +
+
+
+ {/* Badge */} + +
+ + + How we run evals + + +
+
+ + + Outcomes over artifacts + + + {/* Heading */} + + You’re the Builder + + Ship{" "} + + Real Code + + + + + {/* Subheading */} + + Pick an objective. We’ll suggest an agent lineup and default model + based on eval results. Treat it as a baseline for your repo. + + + + + +
+
+
) : null} - - {/* Stats bar */} - - -
- -
- -
- -

{/* ── Outcomes Overlay ───────────────────────────────────────── */} {enableOutcomeLayer ? ( -
-
+
+
- -

- Start with an outcome -

-

- Pick what you're trying to ship. We assemble a Builder Profile: capabilities - plus a default model recommendation. It's a baseline, not a guarantee. -

-
- - - {EVAL_OUTCOMES.map((outcome) => { - const Icon = outcome.icon - const isSelected = outcome.id === selectedOutcomeId - const isFeatured = outcome.id === "prototype_to_pr" - - return ( - setOutcome(isSelected ? null : outcome.id)} - className={[ - "group rounded-2xl border bg-card/40 p-5 text-left backdrop-blur-sm transition-all duration-200 hover:bg-card/60", - isSelected - ? "border-foreground/20 ring-1 ring-foreground/15" - : "border-border/50 hover:border-border", - isFeatured ? "lg:col-span-2" : "", - ].join(" ")}> -
-
- -
-
- {isFeatured ? ( - - Recommended starting point - - ) : null} -

- {outcome.name} + {isProfileView ? ( + + {selectedOutcome ? ( + <> +

+
+

+ Profile +

+

+ {selectedOutcome.name}

-

- {outcome.description} +

+ {profileDescription}

-
- - ) - })} - - {selectedOutcome ? ( - -
-
-

- {profileTitle} -

-

- {selectedOutcome.name} -

-

- {profileDescription} -

- - {profileHowItWorks.length > 0 ? ( -
-

- {selectedOutcome.builderProfile - ? "How it works" - : "Why it works"} -

-
    - {profileHowItWorks.map((line) => ( -
  • - - {line} -
  • - ))} -
+
+ {examplePrompt ? ( +
+

+ Example prompt +

+
+																{examplePrompt}
+															
+
+ ) : null} + + Start in Roo Code Cloud + + + + Back to objectives + +
- ) : null} +
- {selectedOutcome.builderProfile?.howItWorks ? ( -
-

- Why it works -

-
    - {selectedOutcome.whyItWorks.map((line) => ( -
  • - - {line} -
  • - ))} -
+
+
+
+

+ Optimized for +

+

+ {getModeLabel(selectedMode)} +

+
+
+

+ Default model +

+

+ {objectiveDefaultModelLabel} +

+
+
+

+ Agents +

+

+ {agentCapabilities.length} +

+
+
+

+ Skills +

+

+ {skillCapabilities.length} +

+
- ) : null} -
+
-
-
-
- - Optimize for - +
+

+ Optimize for +

+
{OPTIMIZATION_MODES.map((mode) => { const isSelected = mode.id === selectedMode return ( @@ -719,59 +550,53 @@ export function WorkersContent({ title={mode.description} onClick={() => setMode(mode.id)} className={[ - "inline-flex items-center rounded-full border px-3 py-1.5 text-xs font-semibold transition-colors", + "rounded-full px-3 py-1.5 text-xs font-semibold transition-colors", isSelected - ? "border-foreground/20 bg-foreground/5 text-foreground" - : "border-border/50 bg-background/20 text-foreground/75 hover:border-border hover:text-foreground", + ? "bg-foreground/10 text-foreground" + : "text-muted-foreground hover:text-foreground", ].join(" ")}> {mode.label} ) })}
- - Capability set -
-
-
+
+

- Agents -

-

- Click for candidates & settings + Agent lineup

-
-
- {agentCapabilities.map((capability) => { - const roleId = capability.roleId! - const rec = recByRole.get(roleId) - const candidate = getModeCandidate(rec, selectedMode) - - return ( - -
+
+ {agentCapabilities.map((capability) => { + const roleId = capability.roleId! + const rec = recByRole.get(roleId) + const candidate = getModeCandidate(rec, selectedMode) + + const providerColor = candidate + ? (PROVIDER_COLORS[candidate.provider] ?? "#94a3b8") + : "#94a3b8" + + return ( +
-

+

{capability.name}

- {capability.description ? ( -

- {capability.description} -

- ) : null} -

+

+ {candidate ? ( - <> - - {getModeLabel(selectedMode)}: - {" "} + {candidate.displayName} - + ) : ( View models @@ -779,228 +604,368 @@ export function WorkersContent({ )}

- + + + ) + })} +
+
+ +
+

+ Skills included +

+
+ {skillCapabilities.length > 0 ? ( + skillCapabilities.map((capability) => ( +
+

+ {capability.name} +

+

+ {capability.description} +

- - ) - })} + )) + ) : ( +

+ No skills listed for this profile yet. +

+ )} +
- {builtInCapabilities.length > 0 ? ( -
+ {profileHowItWorks.length > 0 || selectedOutcome.whyItWorks.length > 0 ? ( +

- Built-ins + Rationale

-
- {builtInCapabilities.map((capability) => ( -
-
- -
-
-

- {capability.name} -

- - Built-in - -
-

- {capability.description} -

-
-
-
- ))} +
+
+

+ {selectedOutcome.builderProfile + ? "How it works" + : "Why it works"} +

+
    + {profileHowItWorks.map((line) => ( +
  • + + {line} +
  • + ))} +
+
+
+

+ Why it works +

+
    + {selectedOutcome.whyItWorks.map((line) => ( +
  • + + {line} +
  • + ))} +
+
) : null} + + ) : ( +
+
+

+ No objective selected +

+

+ Pick an objective first, then open the profile view. +

+
+ + Back to objectives + +
-
+ )} ) : null} - -
-
- ) : null} - {/* ── Role Cards Grid (baseline only) ────────────────────────── */} - {!enableOutcomeLayer ? ( -
- {/* Subtle section background */} - -
-
-
- - -
- {/* Section connector */} - -

- Choose a setup for the work -

- -
+
+ {/* Left rail: objective + mode */} +
+ +

+ Select your objective +

+
- - {roles.map((role) => { - const rec = recByRole.get(role.id) - const IconComponent = ICON_MAP[role.icon] ?? Code - const candidateCount = rec?.allCandidates.length ?? 0 - const exerciseCount = rec?.totalExercises ?? 0 - const theme = ROLE_THEMES[role.id] ?? DEFAULT_THEME - const topModel = rec?.best[0] ?? null - - return ( -
- {/* Subtle glow on hover */} -
+ className="mt-6 flex flex-wrap items-center justify-center gap-2 lg:justify-start" + variants={fadeUpVariants}> + + Optimize for + +
+ {OPTIMIZATION_MODES.map((mode) => { + const isSelected = mode.id === selectedMode + return ( + + ) + })} +
+ -
- {/* Header: Icon + role badge */} -
-
- + + {EVAL_OUTCOMES.map((outcome) => { + const Icon = outcome.icon + const isSelected = outcome.id === selectedOutcomeId + + return ( + setOutcome(isSelected ? null : outcome.id)} + className={[ + "group w-full rounded-2xl border bg-card/35 p-4 text-left backdrop-blur-sm transition-all duration-200 hover:bg-card/55", + isSelected + ? "border-foreground/20 ring-1 ring-foreground/15" + : "border-border/50 hover:border-border", + ].join(" ")}> +
+
+ +
+
+
+

+ {outcome.name} +

+
+

+ {outcome.description} +

+
- {topModel && ( - - Top: {topModel.displayName} - - )} -
- - {/* Profile name + descriptor */} -

{role.name}

-

- {role.salaryRange} -

+ + ) + })} + +
- {/* Description */} -

- {role.description} -

+ {/* Right rail: profile snapshot */} +
+ +
+
- {/* Best for */} -
-

- Best for -

-
- {role.bestFor.map((item) => ( - - {item} - - ))} +
+
+
+
+

+ Profile snapshot +

+

+ {selectedOutcome + ? selectedOutcome.name + : "Pick an objective"} +

+

+ {selectedOutcome + ? profileDescription + : "Select an objective to see the suggested lineup and default model."} +

-
- {/* Strengths & Weaknesses side by side */} -
- {/* Strengths */} -
-

- Strengths -

-
    - {role.strengths.map((item) => ( -
  • - - {item} -
  • - ))} -
+
+ + Optimized for: {getModeLabel(selectedMode)} +
+
- {/* Weaknesses */} -
-

- Trade-offs -

-
    - {role.weaknesses.map((item) => ( -
  • - - {item} -
  • - ))} -
+
+
+
+

+ Signal +

+

+ {totalEvalRuns.toLocaleString()} runs +

+
+
+

+ Agents +

+

+ {agentCapabilities.length} +

+
+
+

+ Default model +

+

+ {objectiveDefaultModelLabel} +

+
+
- {/* Bottom stats + CTA */} -
-
- - - {candidateCount} models - - - - {exerciseCount.toLocaleString()} exercises - -
+
+
+

+ Agent lineup +

+

+ Open candidates & settings +

+
- - View models - - +
+ {selectedOutcome ? ( +
+ {agentCapabilities.map((capability) => { + const roleId = capability.roleId! + const rec = recByRole.get(roleId) + const candidate = getModeCandidate(rec, selectedMode) + + const providerColor = candidate + ? (PROVIDER_COLORS[candidate.provider] ?? "#94a3b8") + : "#94a3b8" + + return ( + + +
+
+

+ {capability.name} +

+

+ + {candidate ? ( + + {candidate.displayName} + + ) : ( + + View models + + )} +

+
+ +
+ + ) + })} +
+ ) : ( +
+

+ No objective selected +

+

+ Pick an objective to see the recommended agent lineup. +

+
+ )}
+ + {selectedOutcome ? ( + <> + {examplePrompt ? ( +
+

+ Example prompt +

+
+																	{examplePrompt}
+																
+
+ ) : null} +
+ + Start in Roo Code Cloud + + + + Learn more / customize + + +
+ + ) : null}
- ) - })} +
+
) : null} {/* ── AI Coding Capability Over Time ─────────────────────────── */} -
+
{/* Subtle atmospheric background */}

- Pass rates on our eval suite, by model release date. The best ones now score 100%. + Pass rates on our eval suite by model release date. Several current models hit 100% on + this suite.

diff --git a/apps/web-roo-code/src/lib/eval-outcomes.ts b/apps/web-roo-code/src/lib/eval-outcomes.ts index 85ced98347e..4ff9e9b588e 100644 --- a/apps/web-roo-code/src/lib/eval-outcomes.ts +++ b/apps/web-roo-code/src/lib/eval-outcomes.ts @@ -23,6 +23,11 @@ export type EvalOutcomeCapability = { export type EvalOutcomeProfile = { title: string description: string + /** + * Starter prompt shown in the UI to help users understand what to ask for. + * This is product copy, not an eval artifact. + */ + examplePrompt?: string capabilities: EvalOutcomeCapability[] howItWorks: string[] } @@ -46,6 +51,69 @@ export type EvalOutcome = { } export const EVAL_OUTCOMES: EvalOutcome[] = [ + { + id: "review_guardrails", + name: "Idea → Prototype", + description: "Turn a vague idea into a working demo in your real codebase.", + icon: Sparkles, + recommendedRoleIds: ["autonomous", "senior"], + whyItWorks: [ + "Optimizes for momentum: map the codebase fast, then build a working slice.", + "Senior builder keeps the prototype grounded in production constraints.", + ], + builderProfile: { + title: "Your Builder Profile", + description: "For turning an idea into a working demo in your repo.", + examplePrompt: `Objective: Idea → Prototype + +In this repo, turn this idea into a working demo: . + +Constraints: +- Keep scope small and demo-first. +- Use the existing stack and patterns in this codebase. 
+ +Deliver: +- A reviewable PR +- A short walkthrough (how to run it, what works, what’s next)`, + capabilities: [ + { + id: "autonomous_researcher", + name: "Autonomous Researcher", + description: "Maps the codebase, constraints, and best path forward before implementation starts.", + roleId: "autonomous", + }, + { + id: "multi_file_builder", + name: "Senior Builder", + description: "Builds a working prototype directly in your repo across the files it touches.", + roleId: "senior", + }, + { + id: "discovery_loop", + name: "Discovery loop", + description: + "Maps the codebase and constraints before making changes (so the prototype fits reality).", + }, + { + id: "prototype_scaffold", + name: "Prototype scaffold", + description: "Creates the smallest working slice you can demo and build on.", + }, + { + id: "demo_ready_output", + name: "Demo-ready output", + description: + "Delivers a reviewable diff plus a clear walkthrough of what’s working and what’s next.", + }, + ], + howItWorks: [ + "Clarify the objective and success criteria.", + "Explore the codebase and pick the smallest viable implementation path.", + "Build the prototype directly in the repo (no throwaway export/import step).", + "Deliver a demo-ready diff with notes for the next iteration.", + ], + }, + }, { id: "prototype_to_pr", name: "Prototype → PR", @@ -58,8 +126,17 @@ export const EVAL_OUTCOMES: EvalOutcome[] = [ ], builderProfile: { title: "Your Builder Profile", - description: - "A default set of capabilities for turning a working prototype into a reviewable PR—on the production codebase.", + description: "For turning a prototype into a reviewable PR on the production codebase.", + examplePrompt: `Objective: Prototype → PR + +Take the current prototype implementation and turn it into a reviewable PR. 
+ +Do: +- Tighten scope to the smallest shippable diff +- Add/adjust tests, lint, and typechecks as needed + +Deliver: +- A PR-ready diff with a plain-English summary and review notes`, capabilities: [ { id: "multi_file_builder", @@ -110,59 +187,260 @@ export const EVAL_OUTCOMES: EvalOutcome[] = [ }, }, { - id: "paper_cuts", - name: "Paper cuts & small fixes", - description: "Fix the small stuff without dragging engineers off big projects.", - icon: CheckCircle2, - recommendedRoleIds: ["junior", "reviewer"], + id: "issue_to_pr", + name: "Issue → PR", + description: "Run end-to-end work in the background and come back to a reviewable result.", + icon: GitPullRequest, + recommendedRoleIds: ["autonomous", "reviewer"], whyItWorks: [ - "Small diffs are high-leverage when the work is well-scoped.", - "Reviewer keeps the quality bar and reduces surprise.", + "Handles out-of-band work while humans stay on the roadmap.", + "Pairs autonomy with guardrails for merge safety.", ], + builderProfile: { + title: "Your Builder Profile", + description: "For turning an issue into a reviewable PR.", + examplePrompt: `Objective: Issue → PR + +Fix this issue in the repo: . 
+ +Requirements: +- Define “done” in 2-3 acceptance criteria +- Implement the fix and validate it (tests/lint/typechecks) + +Deliver: +- A reviewable PR with context and any follow-ups`, + capabilities: [ + { + id: "autonomous_executor", + name: "Autonomous Executor", + description: "Runs the full loop (investigate → implement → validate) while you stay unblocked.", + roleId: "autonomous", + }, + { + id: "reviewer_guardrails", + name: "Reviewer & Guardrails", + description: "Reviews the diff for correctness, edge cases, and merge safety.", + roleId: "reviewer", + }, + { + id: "issue_intake", + name: "Issue intake", + description: + "Translates a request into scoped tasks, acceptance criteria, and a safe plan of attack.", + }, + { + id: "validation_loop", + name: "Validation loop", + description: "Runs tests/lint/typechecks and iterates until it’s clean (or flags what’s blocked).", + }, + { + id: "pr_ready_output", + name: "PR-ready output", + description: "Produces a focused diff plus a plain-English summary and review notes.", + }, + ], + howItWorks: [ + "Clarify the issue and define what “done” means.", + "Implement in the background with frequent validation checkpoints.", + "Run a reviewer pass to reduce merge risk.", + "Deliver a PR-ready result with context and next steps.", + ], + }, }, { id: "sentry_triage", - name: "Sentry triage", - description: "Turn recurring errors into concrete fixes with proof before review.", + name: "Customer Escalation → Resolved", + description: "Triage a customer-blocking issue and ship the smallest safe fix.", icon: Bug, - recommendedRoleIds: ["autonomous", "reviewer"], + recommendedRoleIds: ["autonomous", "senior", "reviewer"], whyItWorks: [ "Autonomous runs handle multi-step investigation and iteration.", + "Senior builder makes the final fix precise and production-safe.", "Reviewer focuses on safety, correctness, and “does this hold up?”.", ], + builderProfile: { + title: "Your Builder Profile", + description: "For resolving 
a customer escalation quickly and safely.", + examplePrompt: `Objective: Customer Escalation → Resolved + +We have a customer-blocking escalation: +- Symptoms: +- Context: + +Do: +- Find the smallest safe fix with a clear blast-radius assessment +- Add guardrails/tests where it makes sense + +Deliver: +- A PR with the fix and a short “risk + rollout” note`, + capabilities: [ + { + id: "autonomous_triage", + name: "Autonomous Triage", + description: "Investigates logs, context, and repro steps to converge on a fix quickly.", + roleId: "autonomous", + }, + { + id: "senior_fixer", + name: "Senior Builder", + description: "Implements the smallest production-safe fix when the blast radius is unclear.", + roleId: "senior", + }, + { + id: "reviewer_guardrails", + name: "Reviewer & Guardrails", + description: "Double-checks safety and correctness so speed doesn’t create regressions.", + roleId: "reviewer", + }, + { + id: "repro_first", + name: "Repro-first", + description: "Prioritizes a minimal reproduction so we know the fix actually fixes the issue.", + }, + { + id: "minimal_fix", + name: "Minimal safe fix", + description: "Ships the smallest change that unblocks customers, with a clear rollback story.", + }, + { + id: "verification_artifacts", + name: "Verification artifacts", + description: "Provides proof (tests/logs/steps) that the fix works and what it covers.", + }, + ], + howItWorks: [ + "Gather context and reproduce the customer issue.", + "Implement the smallest safe fix with verification.", + "Run a reviewer pass to catch edge cases.", + "Deliver a PR-ready result plus rollout notes.", + ], + }, }, { id: "repro_to_fix", - name: "Bug repro → fix", - description: "Make the handoff less lossy: reproduce, patch, and validate in one loop.", + name: "Bug Report → Fix", + description: "Reproduce, isolate, patch, and validate in one loop.", icon: Workflow, recommendedRoleIds: ["senior", "reviewer"], whyItWorks: [ "Good default for ambiguous bugs that touch a few 
files.", "Reviewer helps catch cross-team assumptions early.", ], + builderProfile: { + title: "Your Builder Profile", + description: "For turning a bug report into a verified fix.", + examplePrompt: `Objective: Bug Report → Fix + +Fix this bug: +- Report: +- Expected vs actual: + +Do: +- Reproduce if possible, then implement the fix +- Validate with tests/lint/typechecks (or explain what’s blocked) + +Deliver: +- A PR with the fix and verification notes`, + capabilities: [ + { + id: "bug_fixer", + name: "Bug Fixer", + description: "Reproduces and fixes bugs efficiently across the files involved.", + roleId: "senior", + }, + { + id: "reviewer_guardrails", + name: "Reviewer & Guardrails", + description: "Reviews the diff for correctness and regression risk before it ships.", + roleId: "reviewer", + }, + { + id: "repro_harness", + name: "Repro harness", + description: + "Creates a minimal reproduction path (tests or steps) to prevent “can’t repro” stalls.", + }, + { + id: "fix_with_tests", + name: "Fix with tests", + description: "Pairs the fix with verification so it doesn’t regress on the next change.", + }, + { + id: "validation_loop", + name: "Validation loop", + description: "Runs tests/lint/typechecks and iterates until it’s clean (or flags what’s blocked).", + }, + ], + howItWorks: [ + "Reproduce the issue and isolate the root cause.", + "Implement a targeted fix with verification.", + "Run a reviewer pass to reduce regression risk.", + "Deliver a PR-ready result with steps to validate.", + ], + }, }, { - id: "review_guardrails", - name: "Guardrails & review", - description: "Raise the quality bar without becoming the blocker.", - icon: GitPullRequest, - recommendedRoleIds: ["reviewer"], - whyItWorks: [ - "Works alongside CI, linters, and team review.", - "Scales judgement through fast, consistent feedback.", - ], - }, - { - id: "issue_to_pr", - name: "Issue → PR", - description: "Run end-to-end work in the background and come back to a reviewable result.", - 
icon: GitPullRequest, - recommendedRoleIds: ["autonomous", "reviewer"], + id: "paper_cuts", + name: "Paper Cuts → Shipped", + description: "Fix the small stuff without dragging engineers off big projects.", + icon: CheckCircle2, + recommendedRoleIds: ["junior", "reviewer"], whyItWorks: [ - "Handles out-of-band work while humans stay on the roadmap.", - "Pairs autonomy with guardrails for merge safety.", + "Small diffs are high-leverage when the work is well-scoped.", + "Reviewer keeps the quality bar and reduces surprise.", ], + builderProfile: { + title: "Your Builder Profile", + description: "For shipping small fixes quickly, cleanly, and safely.", + examplePrompt: `Objective: Paper Cuts → Shipped + +Ship these small fixes in this repo: +- +- +- + +Constraints: +- Keep diffs small and easy to review +- Don’t change behavior unless it’s clearly a bug + +Deliver: +- A PR with grouped, well-scoped commits and a short summary`, + capabilities: [ + { + id: "small_diff_builder", + name: "Small-diff Builder", + description: "Ships focused fixes with low review surface area and minimal risk.", + roleId: "junior", + }, + { + id: "reviewer_guardrails", + name: "Reviewer & Guardrails", + description: "Catches edge cases and keeps changes aligned with team conventions.", + roleId: "reviewer", + }, + { + id: "scope_control", + name: "Scope control", + description: "Keeps changes tight: fewer surprises, faster reviews, easier merges.", + }, + { + id: "quick_validation", + name: "Quick validation", + description: "Runs the relevant checks and flags what’s safe to skip (and what’s not).", + }, + { + id: "pr_ready_output", + name: "PR-ready output", + description: "Produces a focused diff plus a plain-English summary and review notes.", + }, + ], + howItWorks: [ + "Pick the smallest fix that moves the needle.", + "Implement with tight scope control.", + "Validate quickly and review for conventions.", + "Deliver a PR-ready result you can merge confidently.", + ], + }, }, ] diff 
--git a/apps/web-roo-code/src/lib/mock-recommendations.ts b/apps/web-roo-code/src/lib/mock-recommendations.ts index 9a08487ca7d..2e30b5ec0b9 100644 --- a/apps/web-roo-code/src/lib/mock-recommendations.ts +++ b/apps/web-roo-code/src/lib/mock-recommendations.ts @@ -302,7 +302,7 @@ const seniorCandidates: ModelCandidate[] = [ { provider: "moonshot", modelId: "kimi-k2-0905", - displayName: "Kimi K2 0905", + displayName: "Kimi K2", compositeScore: 95, tier: "best", tags: ["budget-hire", "best-value"], @@ -439,7 +439,7 @@ const staffCandidates: ModelCandidate[] = [ { provider: "anthropic", modelId: "claude-opus-4-6", - displayName: "Claude Opus 4.6", + displayName: "Opus 4.6", compositeScore: 98, tier: "best", tags: ["speed-hire", "top-performer"], @@ -467,7 +467,7 @@ const staffCandidates: ModelCandidate[] = [ { provider: "anthropic", modelId: "claude-opus-4-5", - displayName: "Claude Opus 4.5", + displayName: "Opus 4.5", compositeScore: 96, tier: "recommended", tags: [], @@ -495,7 +495,7 @@ const staffCandidates: ModelCandidate[] = [ { provider: "anthropic", modelId: "claude-opus-4-1", - displayName: "Claude Opus 4.1", + displayName: "Opus 4.1", compositeScore: 73, tier: "situational", tags: [], @@ -525,7 +525,7 @@ const staffCandidates: ModelCandidate[] = [ { provider: "anthropic", modelId: "claude-opus-4", - displayName: "Claude Opus 4", + displayName: "Opus 4", compositeScore: 57, tier: "not-recommended", tags: [], @@ -565,7 +565,7 @@ const reviewerCandidates: ModelCandidate[] = [ { provider: "anthropic", modelId: "claude-opus-4-6", - displayName: "Claude Opus 4.6", + displayName: "Opus 4.6", compositeScore: 95, tier: "best", tags: ["speed-hire", "top-performer"], @@ -731,7 +731,7 @@ const autonomousCandidates: ModelCandidate[] = [ { provider: "moonshot", modelId: "kimi-k2-0905", - displayName: "Kimi K2 0905", + displayName: "Kimi K2", compositeScore: 86, tier: "recommended", tags: [], @@ -885,6 +885,6 @@ export const MODEL_TIMELINE: ModelTimelineEntry[] = [ { 
modelName: "GPT-5 Mini", provider: "openai", releaseDate: "2025-12-01", score: 99, costPerRun: 3.34 }, { modelName: "Claude Sonnet 4.5", provider: "anthropic", releaseDate: "2026-01-15", score: 100, costPerRun: 38.43 }, { modelName: "GPT 5.2 (Med)", provider: "openai", releaseDate: "2026-01-20", score: 100, costPerRun: 12.5 }, - { modelName: "Claude Opus 4.6", provider: "anthropic", releaseDate: "2026-02-01", score: 100, costPerRun: 49.48 }, + { modelName: "Opus 4.6", provider: "anthropic", releaseDate: "2026-02-01", score: 100, costPerRun: 49.48 }, { modelName: "Gemini 3 Pro", provider: "google", releaseDate: "2026-02-05", score: 100, costPerRun: 33.06 }, ] diff --git a/apps/web-roo-code/src/lib/objective-default-models-v1.ts b/apps/web-roo-code/src/lib/objective-default-models-v1.ts new file mode 100644 index 00000000000..e60480d908c --- /dev/null +++ b/apps/web-roo-code/src/lib/objective-default-models-v1.ts @@ -0,0 +1,227 @@ +import type { EvalOutcomeId } from "./eval-outcomes" + +type ObjectiveMetric = { score: number; costUsd: number; runtimeS: number } + +type ModelObjectiveMetrics = { + modelId: string + issueResolution: ObjectiveMetric + frontend: ObjectiveMetric + greenfield: ObjectiveMetric + testing: ObjectiveMetric + infoGathering: ObjectiveMetric +} + +type EvalOptimizationModeV1 = "best" | "fastest" | "cost" + +type ObjectiveWeights = { + issueResolution: number + frontend: number + greenfield: number + testing: number + infoGathering: number +} + +type WeightedObjectiveMetrics = { score: number; costUsd: number; runtimeS: number } + +export type ObjectiveDefaultModelV1 = { + modelId: string + weighted: WeightedObjectiveMetrics +} + +const MODEL_METRICS_V1: ModelObjectiveMetrics[] = [ + { + modelId: "claude-opus-4-6", + issueResolution: { score: 74.8, costUsd: 0.56, runtimeS: 178 }, + frontend: { score: 41.8, costUsd: 2.37, runtimeS: 602 }, + greenfield: { score: 43.8, costUsd: 2.5, runtimeS: 388 }, + testing: { score: 78.8, costUsd: 0.43, runtimeS: 
138 }, + infoGathering: { score: 80, costUsd: 1.33, runtimeS: 526 }, + }, + { + modelId: "GPT-5.2-Codex", + issueResolution: { score: 73.8, costUsd: 0.94, runtimeS: 438 }, + frontend: { score: 35.9, costUsd: 2.97, runtimeS: 1434 }, + greenfield: { score: 62.5, costUsd: 2.5, runtimeS: 838 }, + testing: { score: 62.5, costUsd: 0.66, runtimeS: 343 }, + infoGathering: { score: 70.9, costUsd: 1.66, runtimeS: 799 }, + }, + { + modelId: "claude-opus-4-5", + issueResolution: { score: 76.6, costUsd: 1.82, runtimeS: 325 }, + frontend: { score: 41.2, costUsd: 2.54, runtimeS: 671 }, + greenfield: { score: 37.5, costUsd: 4.65, runtimeS: 495 }, + testing: { score: 78.5, costUsd: 1.38, runtimeS: 268 }, + infoGathering: { score: 69.1, costUsd: 0.55, runtimeS: 97 }, + }, + { + modelId: "MiniMax-M2.5", + issueResolution: { score: 72.6, costUsd: 0.1, runtimeS: 455 }, + frontend: { score: 25, costUsd: 0.15, runtimeS: 611 }, + greenfield: { score: 50, costUsd: 0.16, runtimeS: 376 }, + testing: { score: 68.1, costUsd: 0.07, runtimeS: 389 }, + infoGathering: { score: 47.9, costUsd: 0.06, runtimeS: 716 }, + }, + { + modelId: "GPT-5.2", + issueResolution: { score: 74.6, costUsd: 0.86, runtimeS: 476 }, + frontend: { score: 30.9, costUsd: 2.77, runtimeS: 1571 }, + greenfield: { score: 18.8, costUsd: 0.71, runtimeS: 397 }, + testing: { score: 73.2, costUsd: 0.56, runtimeS: 347 }, + infoGathering: { score: 65.5, costUsd: 0.48, runtimeS: 189 }, + }, + { + modelId: "claude-sonnet-4-5", + issueResolution: { score: 74.2, costUsd: 1.19, runtimeS: 534 }, + frontend: { score: 36.8, costUsd: 1.89, runtimeS: 787 }, + greenfield: { score: 12.5, costUsd: 2.65, runtimeS: 744 }, + testing: { score: 68.8, costUsd: 0.98, runtimeS: 488 }, + infoGathering: { score: 58.8, costUsd: 0.38, runtimeS: 126 }, + }, + { + modelId: "Kimi-K2.5", + issueResolution: { score: 68.8, costUsd: 0.48, runtimeS: 707 }, + frontend: { score: 32.8, costUsd: 1.58, runtimeS: 921 }, + greenfield: { score: 18.8, costUsd: 0.96, runtimeS: 
814 }, + testing: { score: 61.9, costUsd: 0.42, runtimeS: 385 }, + infoGathering: { score: 63.6, costUsd: 0.39, runtimeS: 602 }, + }, + { + modelId: "Gemini-3-Flash", + issueResolution: { score: 74.6, costUsd: 0.42, runtimeS: 343 }, + frontend: { score: 22.1, costUsd: 0.8, runtimeS: 1152 }, + greenfield: { score: 18.8, costUsd: 0.82, runtimeS: 399 }, + testing: { score: 70.7, costUsd: 0.3, runtimeS: 213 }, + infoGathering: { score: 58.8, costUsd: 0.38, runtimeS: 398 }, + }, + { + modelId: "DeepSeek-V3.2-Reasoner", + issueResolution: { score: 71.6, costUsd: 0.16, runtimeS: 1429 }, + frontend: { score: 27.9, costUsd: 0.19, runtimeS: 1515 }, + greenfield: { score: 31.2, costUsd: 0.12, runtimeS: 1411 }, + testing: { score: 53.6, costUsd: 0.12, runtimeS: 1215 }, + infoGathering: { score: 50.3, costUsd: 0.06, runtimeS: 427 }, + }, + { + modelId: "Gemini-3-Pro", + issueResolution: { score: 70.6, costUsd: 0.95, runtimeS: 343 }, + frontend: { score: 36.8, costUsd: 1.46, runtimeS: 710 }, + greenfield: { score: 12.5, costUsd: 2.68, runtimeS: 554 }, + testing: { score: 68.6, costUsd: 1.01, runtimeS: 386 }, + infoGathering: { score: 44.2, costUsd: 1.5, runtimeS: 1775 }, + }, + { + modelId: "MiniMax-M2.1", + issueResolution: { score: 68.8, costUsd: 0.14, runtimeS: 579 }, + frontend: { score: 16.2, costUsd: 0.21, runtimeS: 1417 }, + greenfield: { score: 25, costUsd: 0.33, runtimeS: 826 }, + testing: { score: 61.4, costUsd: 0.11, runtimeS: 473 }, + infoGathering: { score: 40.6, costUsd: 0.06, runtimeS: 641 }, + }, + { + modelId: "GLM-4.7", + issueResolution: { score: 73.4, costUsd: 0.56, runtimeS: 1007 }, + frontend: { score: 22.1, costUsd: 0.66, runtimeS: 1519 }, + greenfield: { score: 12.5, costUsd: 0.54, runtimeS: 578 }, + testing: { score: 49.4, costUsd: 0.37, runtimeS: 744 }, + infoGathering: { score: 53.9, costUsd: 0.46, runtimeS: 1138 }, + }, + { + modelId: "Kimi-K2-Thinking", + issueResolution: { score: 69.2, costUsd: 2, runtimeS: 1325 }, + frontend: { score: 32.4, 
costUsd: 2.31, runtimeS: 1641 }, + greenfield: { score: 18.8, costUsd: 6.78, runtimeS: 2314 }, + testing: { score: 47.3, costUsd: 1.39, runtimeS: 1253 }, + infoGathering: { score: 43.6, costUsd: 0.65, runtimeS: 279 }, + }, + { + modelId: "Qwen3-Coder-480B", + issueResolution: { score: 62.4, costUsd: 1.26, runtimeS: 680 }, + frontend: { score: 23.5, costUsd: 2.09, runtimeS: 1006 }, + greenfield: { score: 0, costUsd: 1.79, runtimeS: 924 }, + testing: { score: 34.9, costUsd: 0.97, runtimeS: 626 }, + infoGathering: { score: 33.9, costUsd: 0.28, runtimeS: 197 }, + }, +] + +function getOutcomeWeights(outcomeId: EvalOutcomeId): ObjectiveWeights { + // These are intentionally opinionated. They exist to make the prototype feel realistic + // before we wire real Roo Code Cloud evals. + switch (outcomeId) { + // Idea → Prototype + case "review_guardrails": + return { greenfield: 0.5, infoGathering: 0.35, frontend: 0.1, testing: 0.05, issueResolution: 0 } + // Prototype → PR + case "prototype_to_pr": + return { greenfield: 0.35, testing: 0.35, issueResolution: 0.2, frontend: 0.1, infoGathering: 0 } + // Issue → PR + case "issue_to_pr": + return { issueResolution: 0.4, testing: 0.3, infoGathering: 0.2, frontend: 0.1, greenfield: 0 } + // Customer Escalation → Resolved + case "sentry_triage": + return { issueResolution: 0.55, infoGathering: 0.25, testing: 0.2, frontend: 0, greenfield: 0 } + // Bug Report → Fix + case "repro_to_fix": + return { issueResolution: 0.45, testing: 0.4, infoGathering: 0.15, frontend: 0, greenfield: 0 } + // Paper Cuts → Shipped + case "paper_cuts": + return { frontend: 0.6, issueResolution: 0.2, testing: 0.2, greenfield: 0, infoGathering: 0 } + } +} + +function getWeightedMetrics(row: ModelObjectiveMetrics, weights: ObjectiveWeights): WeightedObjectiveMetrics { + const score = + row.issueResolution.score * weights.issueResolution + + row.frontend.score * weights.frontend + + row.greenfield.score * weights.greenfield + + row.testing.score * 
weights.testing + + row.infoGathering.score * weights.infoGathering + const costUsd = + row.issueResolution.costUsd * weights.issueResolution + + row.frontend.costUsd * weights.frontend + + row.greenfield.costUsd * weights.greenfield + + row.testing.costUsd * weights.testing + + row.infoGathering.costUsd * weights.infoGathering + const runtimeS = + row.issueResolution.runtimeS * weights.issueResolution + + row.frontend.runtimeS * weights.frontend + + row.greenfield.runtimeS * weights.greenfield + + row.testing.runtimeS * weights.testing + + row.infoGathering.runtimeS * weights.infoGathering + return { score, costUsd, runtimeS } +} + +function pickByMode( + rows: Array<{ modelId: string; weighted: WeightedObjectiveMetrics }>, + mode: EvalOptimizationModeV1, +): { modelId: string; weighted: WeightedObjectiveMetrics } { + const bestByQuality = rows.reduce((best, cur) => (cur.weighted.score > best.weighted.score ? cur : best)) + + // For speed/cost modes, don't pick a model that is dramatically worse on quality. + // This keeps the v1 prototype recommendations feeling credible even when a model is + // extremely cheap or fast but underperforms for the selected objective. + const QUALITY_FLOOR = 0.85 + const qualityThreshold = bestByQuality.weighted.score * QUALITY_FLOOR + const qualityGated = rows.filter((r) => r.weighted.score >= qualityThreshold) + const pool = qualityGated.length > 0 ? qualityGated : rows + + if (mode === "fastest") { + return pool.reduce((best, cur) => (cur.weighted.runtimeS < best.weighted.runtimeS ? cur : best)) + } + if (mode === "cost") { + return pool.reduce((best, cur) => (cur.weighted.costUsd < best.weighted.costUsd ? 
cur : best)) + } + return bestByQuality +} + +export function pickObjectiveDefaultModelV1( + outcomeId: EvalOutcomeId, + mode: EvalOptimizationModeV1, +): ObjectiveDefaultModelV1 | null { + const weights = getOutcomeWeights(outcomeId) + const candidates = MODEL_METRICS_V1.map((row) => ({ + modelId: row.modelId, + weighted: getWeightedMetrics(row, weights), + })) + if (candidates.length === 0) return null + return pickByMode(candidates, mode) +} From b3fb878c71b15879e983f14761396e8f22207103 Mon Sep 17 00:00:00 2001 From: Michael Preuss Date: Thu, 12 Feb 2026 19:20:50 -0800 Subject: [PATCH 12/22] Fix workers-v2 redirect util imports --- .../src/app/evals/workers-v2/[roleId]/compare/page.tsx | 2 +- apps/web-roo-code/src/app/evals/workers-v2/[roleId]/page.tsx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/compare/page.tsx b/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/compare/page.tsx index ceb688d8a7c..f6e566c1f1c 100644 --- a/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/compare/page.tsx +++ b/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/compare/page.tsx @@ -1,6 +1,6 @@ import { permanentRedirect } from "next/navigation" -import { buildQueryString, type RedirectSearchParams } from "../../../_redirect-utils" +import { buildQueryString, type RedirectSearchParams } from "../../_redirect-utils" type PageProps = { params: Promise<{ roleId: string }> diff --git a/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/page.tsx b/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/page.tsx index affc2355154..5b921157f8a 100644 --- a/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/page.tsx +++ b/apps/web-roo-code/src/app/evals/workers-v2/[roleId]/page.tsx @@ -1,6 +1,6 @@ import { permanentRedirect } from "next/navigation" -import { buildQueryString, type RedirectSearchParams } from "../../_redirect-utils" +import { buildQueryString, type RedirectSearchParams } from 
"../_redirect-utils" type PageProps = { params: Promise<{ roleId: string }> From 8cf93a786fef0aa2890ba59ec040bc7f91c264f5 Mon Sep 17 00:00:00 2001 From: Roo Code Date: Fri, 13 Feb 2026 03:39:25 +0000 Subject: [PATCH 13/22] fix(web-evals): lift Fraunces/IBM Plex Sans font setup to shared evals layout Move --font-display and --font-body CSS variable declarations from workers/page.tsx into a new evals/layout.tsx so all evals sub-pages (methodology, workers, workers/[roleId], etc.) inherit the font variables without duplicating the setup. --- apps/web-roo-code/src/app/evals/layout.tsx | 8 +++++++ .../src/app/evals/workers/page.tsx | 24 +++++++------------ 2 files changed, 17 insertions(+), 15 deletions(-) create mode 100644 apps/web-roo-code/src/app/evals/layout.tsx diff --git a/apps/web-roo-code/src/app/evals/layout.tsx b/apps/web-roo-code/src/app/evals/layout.tsx new file mode 100644 index 00000000000..f6ebd164a4a --- /dev/null +++ b/apps/web-roo-code/src/app/evals/layout.tsx @@ -0,0 +1,8 @@ +import { Fraunces, IBM_Plex_Sans } from "next/font/google" + +const display = Fraunces({ subsets: ["latin"], variable: "--font-display" }) +const body = IBM_Plex_Sans({ subsets: ["latin"], weight: ["400", "500", "600"], variable: "--font-body" }) + +export default function EvalsLayout({ children }: { children: React.ReactNode }) { + return
{children}
+} diff --git a/apps/web-roo-code/src/app/evals/workers/page.tsx b/apps/web-roo-code/src/app/evals/workers/page.tsx index 718011b56dc..0e6eaae44ce 100644 --- a/apps/web-roo-code/src/app/evals/workers/page.tsx +++ b/apps/web-roo-code/src/app/evals/workers/page.tsx @@ -1,5 +1,4 @@ import type { Metadata } from "next" -import { Fraunces, IBM_Plex_Sans } from "next/font/google" import { SEO } from "@/lib/seo" import { ogImageUrl } from "@/lib/og" @@ -15,9 +14,6 @@ const DESCRIPTION = const OG_DESCRIPTION = "Outcome-first recommendations for shipping production code" const PATH = "/evals/workers" -const display = Fraunces({ subsets: ["latin"], variable: "--font-display" }) -const body = IBM_Plex_Sans({ subsets: ["latin"], weight: ["400", "500", "600"], variable: "--font-body" }) - export const metadata: Metadata = { title: TITLE, description: DESCRIPTION, @@ -79,16 +75,14 @@ export default function WorkersPage() { .pop() return ( -
- -
+ ) } From 1d03382f268dd8869ae9855a0275ec582fcc8871 Mon Sep 17 00:00:00 2001 From: Michael Preuss Date: Thu, 12 Feb 2026 20:49:17 -0800 Subject: [PATCH 14/22] Refine evals objective selection and recommendation URL defaults --- .../evals/methodology/methodology-content.tsx | 8 +- .../recommendations/[roleId]/compare/page.tsx | 2 + .../evals/recommendations/[roleId]/page.tsx | 2 + .../src/app/evals/recommendations/page.tsx | 94 +++++++++++++++++++ .../src/app/evals/workers-v2/page.tsx | 2 +- .../workers/[roleId]/candidates-content.tsx | 2 +- .../[roleId]/compare/comparison-chart.tsx | 2 +- .../evals/workers/[roleId]/compare/page.tsx | 4 +- .../src/app/evals/workers/[roleId]/page.tsx | 4 +- .../src/app/evals/workers/workers-content.tsx | 43 ++++++--- 10 files changed, 138 insertions(+), 25 deletions(-) create mode 100644 apps/web-roo-code/src/app/evals/recommendations/[roleId]/compare/page.tsx create mode 100644 apps/web-roo-code/src/app/evals/recommendations/[roleId]/page.tsx create mode 100644 apps/web-roo-code/src/app/evals/recommendations/page.tsx diff --git a/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx b/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx index 852356c6c80..6bca818da15 100644 --- a/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx +++ b/apps/web-roo-code/src/app/evals/methodology/methodology-content.tsx @@ -147,7 +147,9 @@ export function MethodologyContent() { Evals / - + Recommendations / @@ -183,7 +185,7 @@ export function MethodologyContent() { variants={fadeUpVariants} className="mt-10 flex flex-wrap items-center gap-5"> View recommendations @@ -500,7 +502,7 @@ export function MethodologyContent() { Links

- +
diff --git a/apps/web-roo-code/src/app/evals/recommendations/[roleId]/compare/page.tsx b/apps/web-roo-code/src/app/evals/recommendations/[roleId]/compare/page.tsx new file mode 100644 index 00000000000..24ceb8946c6 --- /dev/null +++ b/apps/web-roo-code/src/app/evals/recommendations/[roleId]/compare/page.tsx @@ -0,0 +1,2 @@ +export { generateMetadata } from "../../../workers/[roleId]/compare/page" +export { default } from "../../../workers/[roleId]/compare/page" diff --git a/apps/web-roo-code/src/app/evals/recommendations/[roleId]/page.tsx b/apps/web-roo-code/src/app/evals/recommendations/[roleId]/page.tsx new file mode 100644 index 00000000000..e40887c5bfe --- /dev/null +++ b/apps/web-roo-code/src/app/evals/recommendations/[roleId]/page.tsx @@ -0,0 +1,2 @@ +export { generateMetadata } from "../../workers/[roleId]/page" +export { default } from "../../workers/[roleId]/page" diff --git a/apps/web-roo-code/src/app/evals/recommendations/page.tsx b/apps/web-roo-code/src/app/evals/recommendations/page.tsx new file mode 100644 index 00000000000..6bf7b6c5e8b --- /dev/null +++ b/apps/web-roo-code/src/app/evals/recommendations/page.tsx @@ -0,0 +1,94 @@ +import type { Metadata } from "next" +import { Fraunces, IBM_Plex_Sans } from "next/font/google" + +import { SEO } from "@/lib/seo" +import { ogImageUrl } from "@/lib/og" +import { getEngineerRoles, getAllRecommendations } from "@/lib/mock-recommendations" + +import { WorkersContent } from "../workers/workers-content" + +// ── SEO Metadata ──────────────────────────────────────────────────────────── + +const TITLE = "Build with Roo Code Cloud | Roo Code Evals" +const DESCRIPTION = + "Outcome-first, eval-backed recommendations for shipping production code. Start from your objective and pick a tradeoff." 
+const OG_DESCRIPTION = "Outcome-first recommendations for shipping production code" +const PATH = "/evals/recommendations" + +const display = Fraunces({ subsets: ["latin"], variable: "--font-display" }) +const body = IBM_Plex_Sans({ subsets: ["latin"], weight: ["400", "500", "600"], variable: "--font-body" }) + +export const metadata: Metadata = { + title: TITLE, + description: DESCRIPTION, + alternates: { + canonical: `${SEO.url}${PATH}`, + }, + openGraph: { + title: TITLE, + description: DESCRIPTION, + url: `${SEO.url}${PATH}`, + siteName: SEO.name, + images: [ + { + url: ogImageUrl(TITLE, OG_DESCRIPTION), + width: 1200, + height: 630, + alt: TITLE, + }, + ], + locale: SEO.locale, + type: "website", + }, + twitter: { + card: SEO.twitterCard, + title: TITLE, + description: DESCRIPTION, + images: [ogImageUrl(TITLE, OG_DESCRIPTION)], + }, + keywords: [ + ...SEO.keywords, + "AI coding", + "coding agents", + "roo code cloud", + "model recommendations", + "coding evals", + "model comparison", + "shipping code", + "prototype", + ], +} + +// ── Page Component ────────────────────────────────────────────────────────── + +export default function RecommendationsPage() { + const roles = getEngineerRoles() + const recommendations = getAllRecommendations() + + // Aggregate totals + const totalEvalRuns = recommendations.reduce((sum, recommendation) => sum + recommendation.totalEvalRuns, 0) + const totalExercises = recommendations.reduce((sum, recommendation) => sum + recommendation.totalExercises, 0) + const uniqueModels = new Set( + recommendations.flatMap((recommendation) => recommendation.allCandidates.map((candidate) => candidate.modelId)), + ) + const totalModels = uniqueModels.size + + const lastUpdated = recommendations + .map((r) => r.lastUpdated) + .sort() + .pop() + + return ( +
+ +
+ ) +} diff --git a/apps/web-roo-code/src/app/evals/workers-v2/page.tsx b/apps/web-roo-code/src/app/evals/workers-v2/page.tsx index e8559e09ca9..4f067eb2769 100644 --- a/apps/web-roo-code/src/app/evals/workers-v2/page.tsx +++ b/apps/web-roo-code/src/app/evals/workers-v2/page.tsx @@ -8,5 +8,5 @@ type PageProps = { export default async function WorkersV2Page({ searchParams }: PageProps) { const sp = (await searchParams) ?? {} - permanentRedirect(`/evals/workers${buildQueryString(sp)}`) + permanentRedirect(`/evals/recommendations${buildQueryString(sp)}`) } diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx index 0b1105650f1..cda3c52c6eb 100644 --- a/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx +++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/candidates-content.tsx @@ -660,7 +660,7 @@ export function CandidatesContent({ totalExercises, lastUpdated, cloudUrls, - workersRootPath = "/evals/workers", + workersRootPath = "/evals/recommendations", }: CandidatesContentProps) { const searchParams = useSearchParams() const theme = ROLE_THEMES[roleId] ?? 
DEFAULT_THEME diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx index 8c005cdbd5f..186ef34c778 100644 --- a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx +++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/comparison-chart.tsx @@ -441,7 +441,7 @@ export function ComparisonChart({ recommendation, role, roleId, - workersRootPath = "/evals/workers", + workersRootPath = "/evals/recommendations", }: ComparisonChartProps) { const searchParams = useSearchParams() const { allCandidates } = recommendation diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/page.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/page.tsx index 9d03aa9cca7..ac16a21b56f 100644 --- a/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/page.tsx +++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/compare/page.tsx @@ -25,7 +25,7 @@ export async function generateMetadata({ params }: PageProps): Promise const title = `Compare Models — ${role.name} | Roo Code Evals` const description = `Interactive comparison of AI models for the ${role.name} setup. 
Compare composite score, success rate, cost efficiency, and speed.` const ogDescription = `Compare Models — ${role.name}` - const path = `/evals/workers/${roleId}/compare` + const path = `/evals/recommendations/${roleId}/compare` return { title, @@ -82,7 +82,7 @@ export default async function CompareModelsPage({ params }: PageProps) { recommendation={recommendation} role={recommendation.role} roleId={roleId} - workersRootPath="/evals/workers" + workersRootPath="/evals/recommendations" /> ) } diff --git a/apps/web-roo-code/src/app/evals/workers/[roleId]/page.tsx b/apps/web-roo-code/src/app/evals/workers/[roleId]/page.tsx index 54ca16d0d26..c408f5098c7 100644 --- a/apps/web-roo-code/src/app/evals/workers/[roleId]/page.tsx +++ b/apps/web-roo-code/src/app/evals/workers/[roleId]/page.tsx @@ -26,7 +26,7 @@ export async function generateMetadata({ params }: PageProps): Promise const title = `${role.name} — Recommended Models | Roo Code Evals` const description = `Eval-backed recommendations for ${role.name}. 
Compare models by success rate, cost, and speed across 5 languages.` const ogDescription = `${role.name} — Recommended Models` - const path = `/evals/workers/${roleId}` + const path = `/evals/recommendations/${roleId}` return { title, @@ -100,7 +100,7 @@ export default async function RoleCandidatesPage({ params }: PageProps) { totalExercises={totalExercises} lastUpdated={lastUpdated} cloudUrls={cloudUrls} - workersRootPath="/evals/workers" + workersRootPath="/evals/recommendations" /> ) } diff --git a/apps/web-roo-code/src/app/evals/workers/workers-content.tsx b/apps/web-roo-code/src/app/evals/workers/workers-content.tsx index 9189190120d..21d8646257c 100644 --- a/apps/web-roo-code/src/app/evals/workers/workers-content.tsx +++ b/apps/web-roo-code/src/app/evals/workers/workers-content.tsx @@ -202,7 +202,7 @@ export function WorkersContent({ totalExercises: _totalExercises, totalModels: _totalModels, lastUpdated, - workersRootPath = "/evals/workers", + workersRootPath = "/evals/recommendations", }: WorkersContentProps) { const enableOutcomeLayer = ENABLE_OUTCOME_LAYER const router = useRouter() @@ -214,6 +214,10 @@ export function WorkersContent({ if (!outcome) return null return isEvalOutcomeId(outcome) ? outcome : null }, [searchParams]) + const effectiveOutcomeId = useMemo(() => { + if (selectedOutcomeId) return selectedOutcomeId + return EVAL_OUTCOMES[0]?.id ?? null + }, [selectedOutcomeId]) const selectedMode = useMemo((): EvalOptimizationMode => { const mode = searchParams.get("mode") @@ -255,18 +259,27 @@ export function WorkersContent({ const roleById = useMemo(() => new Map(roles.map((r) => [r.id, r])), [roles]) const selectedOutcome = useMemo(() => { - if (!selectedOutcomeId) return null - return EVAL_OUTCOMES.find((o) => o.id === selectedOutcomeId) ?? null - }, [selectedOutcomeId]) + if (!effectiveOutcomeId) return null + return EVAL_OUTCOMES.find((o) => o.id === effectiveOutcomeId) ?? 
null + }, [effectiveOutcomeId]) const setupQuery = useMemo(() => { - if (!selectedOutcomeId) return "" + if (!effectiveOutcomeId) return "" const params = new URLSearchParams() - params.set("outcome", selectedOutcomeId) + params.set("outcome", effectiveOutcomeId) params.set("mode", selectedMode) const query = params.toString() return query ? `?${query}` : "" - }, [selectedOutcomeId, selectedMode]) + }, [effectiveOutcomeId, selectedMode]) + + const profileViewQuery = useMemo(() => { + if (!effectiveOutcomeId) return "" + const params = new URLSearchParams() + params.set("outcome", effectiveOutcomeId) + params.set("mode", selectedMode) + params.set("view", "profile") + return `?${params.toString()}` + }, [effectiveOutcomeId, selectedMode]) const isProfileView = useMemo(() => { return searchParams.get("view") === "profile" @@ -277,22 +290,22 @@ export function WorkersContent({ "A default setup built from our eval signals. It’s a baseline, not a guarantee." const profileHowItWorks = selectedOutcome?.builderProfile?.howItWorks ?? selectedOutcome?.whyItWorks ?? [] const objectiveDefaultModel = useMemo(() => { - if (!selectedOutcomeId) return null - return pickObjectiveDefaultModelV1(selectedOutcomeId, selectedMode) - }, [selectedOutcomeId, selectedMode]) + if (!effectiveOutcomeId) return null + return pickObjectiveDefaultModelV1(effectiveOutcomeId, selectedMode) + }, [effectiveOutcomeId, selectedMode]) const objectiveDefaultModelLabel = useMemo(() => { if (!objectiveDefaultModel?.modelId) return "—" return formatModelIdForUi(objectiveDefaultModel.modelId) }, [objectiveDefaultModel]) const examplePrompt = selectedOutcome?.builderProfile?.examplePrompt ?? 
"" const cloudSetupHref = useMemo(() => { - if (!selectedOutcomeId) return "/cloud-agents/setup" + if (!effectiveOutcomeId) return "/cloud-agents/setup" const params = new URLSearchParams() - params.set("outcome", selectedOutcomeId) + params.set("outcome", effectiveOutcomeId) params.set("mode", selectedMode) if (examplePrompt) params.set("prompt", examplePrompt) return `/cloud-agents/setup?${params.toString()}` - }, [examplePrompt, selectedMode, selectedOutcomeId]) + }, [examplePrompt, selectedMode, effectiveOutcomeId]) const profileCapabilities = useMemo(() => { if (!selectedOutcome) return [] @@ -741,7 +754,7 @@ export function WorkersContent({ {EVAL_OUTCOMES.map((outcome) => { const Icon = outcome.icon - const isSelected = outcome.id === selectedOutcomeId + const isSelected = outcome.id === effectiveOutcomeId return ( Learn more / customize From 268b183dee756271f629b93a30050bb77f151331 Mon Sep 17 00:00:00 2001 From: Michael Preuss Date: Thu, 12 Feb 2026 21:21:14 -0800 Subject: [PATCH 15/22] Add objective deep-dive pages under eval recommendations --- .../[objectiveSlug]/objective-content.tsx | 631 ++++++++++++++++++ .../recommendations/[objectiveSlug]/page.tsx | 102 +++ .../recommendations/[roleId]/compare/page.tsx | 2 - .../evals/recommendations/[roleId]/page.tsx | 2 - .../src/app/evals/recommendations/layout.tsx | 9 + .../src/app/evals/recommendations/page.tsx | 47 +- .../roles/[roleId]/compare/page.tsx | 85 +++ .../recommendations/roles/[roleId]/page.tsx | 101 +++ .../workers/[roleId]/candidates-content.tsx | 31 +- .../[roleId]/compare/comparison-chart.tsx | 31 +- .../evals/workers/[roleId]/compare/page.tsx | 4 +- .../src/app/evals/workers/[roleId]/page.tsx | 4 +- .../src/app/evals/workers/workers-content.tsx | 21 +- apps/web-roo-code/src/lib/eval-outcomes.ts | 12 + 14 files changed, 1026 insertions(+), 56 deletions(-) create mode 100644 apps/web-roo-code/src/app/evals/recommendations/[objectiveSlug]/objective-content.tsx create mode 100644 
apps/web-roo-code/src/app/evals/recommendations/[objectiveSlug]/page.tsx delete mode 100644 apps/web-roo-code/src/app/evals/recommendations/[roleId]/compare/page.tsx delete mode 100644 apps/web-roo-code/src/app/evals/recommendations/[roleId]/page.tsx create mode 100644 apps/web-roo-code/src/app/evals/recommendations/layout.tsx create mode 100644 apps/web-roo-code/src/app/evals/recommendations/roles/[roleId]/compare/page.tsx create mode 100644 apps/web-roo-code/src/app/evals/recommendations/roles/[roleId]/page.tsx diff --git a/apps/web-roo-code/src/app/evals/recommendations/[objectiveSlug]/objective-content.tsx b/apps/web-roo-code/src/app/evals/recommendations/[objectiveSlug]/objective-content.tsx new file mode 100644 index 00000000000..2ceea57a0f9 --- /dev/null +++ b/apps/web-roo-code/src/app/evals/recommendations/[objectiveSlug]/objective-content.tsx @@ -0,0 +1,631 @@ +"use client" + +import { useCallback, useMemo, useState } from "react" +import Link from "next/link" +import { useSearchParams } from "next/navigation" +import { motion } from "framer-motion" +import { ArrowRight, BarChart3, Copy, ExternalLink, SlidersHorizontal, Sparkles, Workflow } from "lucide-react" +import { Cell, ResponsiveContainer, Scatter, ScatterChart, Tooltip, XAxis, YAxis, ZAxis } from "recharts" + +import type { RoleRecommendation, ModelCandidate } from "@/lib/mock-recommendations" +import { getCloudSetupUrl } from "@/lib/mock-recommendations" + +type EvalOptimizationMode = "best" | "fastest" | "cost" + +type ObjectiveDeepDive = { + id: string + slug: string + name: string + description: string + whyItWorks: string[] + recommendedRoleIds: string[] + builderProfile?: { + title: string + description: string + examplePrompt?: string + howItWorks: string[] + } +} + +type ObjectiveRoleRec = { + roleId: string + recommendation: RoleRecommendation +} + +type Props = { + objective: ObjectiveDeepDive + initialMode: EvalOptimizationMode + recs: ObjectiveRoleRec[] +} + +const containerVariants = { 
+ hidden: { opacity: 0 }, + visible: { + opacity: 1, + transition: { staggerChildren: 0.08, delayChildren: 0.08 }, + }, +} + +const fadeUp = { + hidden: { opacity: 0, y: 14 }, + visible: { opacity: 1, y: 0, transition: { duration: 0.55, ease: [0.21, 0.45, 0.27, 0.9] as const } }, +} + +function isEvalOptimizationMode(value: string): value is EvalOptimizationMode { + return value === "best" || value === "fastest" || value === "cost" +} + +function pickCandidate(rec: RoleRecommendation | undefined, mode: EvalOptimizationMode): ModelCandidate | null { + if (!rec) return null + if (mode === "fastest") return rec.speedHire ?? rec.best[0] ?? null + if (mode === "cost") return rec.budgetHire ?? rec.best[0] ?? null + return rec.best[0] ?? null +} + +function shortProvider(provider: string) { + switch (provider) { + case "openai": + return "OpenAI" + case "anthropic": + return "Anthropic" + case "google": + return "Google" + case "xai": + return "xAI" + case "deepseek": + return "DeepSeek" + case "moonshot": + return "Moonshot" + default: + return provider + } +} + +function formatDollars(value: number) { + if (!Number.isFinite(value)) return "—" + return `$${Math.round(value)}` +} + +function formatSeconds(value: number) { + if (!Number.isFinite(value)) return "—" + return `${value.toFixed(1)}s` +} + +function buildObjectiveQueryString( + searchParams: { get(name: string): string | null }, + objectiveSlug: string, + mode: EvalOptimizationMode, +) { + const params = new URLSearchParams() + params.set("objective", objectiveSlug) + params.set("mode", mode) + + // Preserve any existing query bits we might add later without breaking URLs. 
+ const view = searchParams.get("view") + if (view) params.set("view", view) + + return `?${params.toString()}` +} + +type ObjectiveTooltipPayloadEntry = { payload?: unknown } + +function ObjectiveTooltip({ active, payload }: { active?: boolean; payload?: ObjectiveTooltipPayloadEntry[] }) { + if (!active || !payload?.length) return null + const entryPayload = payload[0]?.payload + const p = entryPayload as + | { + name?: string + provider?: string + score?: number + dailyCost?: number + successRate?: number + } + | undefined + if (!p) return null + + return ( +
+

{p.name}

+

+ {shortProvider(p.provider ?? "")} · {p.score} score ·{" "} + {p.successRate}% success +

+

+ Est daily cost {formatDollars(p.dailyCost ?? NaN)} +

+
+ ) +} + +export function ObjectiveContent({ objective, initialMode, recs }: Props) { + const searchParams = useSearchParams() + + const recByRole = useMemo(() => new Map(recs.map((r) => [r.roleId, r.recommendation])), [recs]) + const lineup = useMemo(() => objective.recommendedRoleIds.filter((id) => recByRole.has(id)), [objective, recByRole]) + + const [mode, setMode] = useState(() => { + const m = searchParams.get("mode") + if (m && isEvalOptimizationMode(m)) return m + return initialMode + }) + + const [overrides, setOverrides] = useState>({}) + + const selectedByRole = useMemo(() => { + const next = new Map() + for (const roleId of lineup) { + const rec = recByRole.get(roleId) + const overrideModelId = overrides[roleId] + if (overrideModelId && rec) { + next.set(roleId, rec.allCandidates.find((c) => c.modelId === overrideModelId) ?? null) + } else { + next.set(roleId, pickCandidate(rec, mode)) + } + } + return next + }, [lineup, mode, overrides, recByRole]) + + const primaryRoleId = lineup[0] ?? null + const primaryCandidate = primaryRoleId ? (selectedByRole.get(primaryRoleId) ?? null) : null + + const modePill = useMemo( + () => [ + { id: "best" as const, label: "Quality" }, + { id: "fastest" as const, label: "Speed" }, + { id: "cost" as const, label: "Cost" }, + ], + [], + ) + + const cloudHref = useMemo(() => { + if (!primaryCandidate) return "https://app.roocode.com" + return getCloudSetupUrl(primaryCandidate) + }, [primaryCandidate]) + + const examplePrompt = objective.builderProfile?.examplePrompt?.trim() ?? 
"" + const copyPrompt = useCallback(async () => { + if (!examplePrompt) return + await navigator.clipboard.writeText(examplePrompt) + }, [examplePrompt]) + + const onSelectModel = useCallback((roleId: string, modelId: string) => { + setOverrides((prev) => ({ ...prev, [roleId]: modelId })) + }, []) + + const onSetMode = useCallback((next: EvalOptimizationMode) => { + setMode(next) + }, []) + + const roleQuery = useMemo( + () => buildObjectiveQueryString(searchParams, objective.slug, mode), + [mode, objective.slug, searchParams], + ) + + const primaryScatter = useMemo(() => { + if (!primaryRoleId) return [] + const rec = recByRole.get(primaryRoleId) + if (!rec) return [] + return rec.allCandidates.map((c) => ({ + name: c.displayName, + provider: c.provider, + score: c.compositeScore, + successRate: c.successRate, + dailyCost: Math.round(c.estimatedDailyCost), + dotSize: Math.round(40 + (c.successRate / 100) * 260), + isSelected: primaryCandidate?.modelId === c.modelId, + })) + }, [primaryCandidate?.modelId, primaryRoleId, recByRole]) + + return ( +
+ {/* Atmosphere */} +
diff --git a/apps/web-roo-code/src/app/evals/workers/workers-content.tsx b/apps/web-roo-code/src/app/evals/workers/workers-content.tsx index a10feb25678..5965c7f8ec6 100644 --- a/apps/web-roo-code/src/app/evals/workers/workers-content.tsx +++ b/apps/web-roo-code/src/app/evals/workers/workers-content.tsx @@ -294,6 +294,7 @@ export function WorkersContent({ if (!objectiveDefaultModel?.modelId) return "—" return formatModelIdForUi(objectiveDefaultModel.modelId) }, [objectiveDefaultModel]) + const selectedModeLabel = getModeLabel(selectedMode) const examplePrompt = selectedOutcome?.builderProfile?.examplePrompt ?? "" const cloudSetupHref = useMemo(() => { if (!effectiveOutcomeId) return "/cloud-agents/setup" @@ -515,7 +516,7 @@ export function WorkersContent({ Optimized for

- {getModeLabel(selectedMode)} + {selectedModeLabel}

@@ -780,9 +781,6 @@ export function WorkersContent({ {outcome.description}

- - Optimized for: {getModeLabel(selectedMode)} -
) @@ -811,8 +809,8 @@ export function WorkersContent({
-
-
+
+

Profile snapshot

@@ -830,7 +828,7 @@ export function WorkersContent({
- Optimized for: {getModeLabel(selectedMode)} + Optimized for: {selectedModeLabel}
@@ -963,7 +961,7 @@ export function WorkersContent({ - Learn more / customize + Learn more about this profile