From 87db8274b4e181a80c1da07c0a5fb63e0325a588 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sun, 8 Feb 2026 16:03:22 +0900 Subject: [PATCH 1/5] feat(evaluation): add optional eval set result persistence to AgentEvaluator Add optional eval result persistence to AgentEvaluator to align programmatic evaluation with existing EvalSetResultsManager workflows. - Extend AgentEvaluator.evaluate() with: - app_name: Optional[str] = None - eval_set_results_manager: Optional[EvalSetResultsManager] = None - Extend AgentEvaluator.evaluate_eval_set() with the same optional parameters. - Persist aggregated EvalCaseResult entries per eval set when a results manager is provided. - Save results before failure assertion so failed runs still leave artifacts. - Add app name resolution logic (explicit app_name first, then derive from agent_module, including ".agent" suffix handling). - Add unit tests covering explicit/derived app_name, save-on-failure behavior, and argument propagation from evaluate() to evaluate_eval_set(). --- src/google/adk/evaluation/agent_evaluator.py | 84 +++++++ .../evaluation/test_agent_evaluator.py | 235 ++++++++++++++++++ 2 files changed, 319 insertions(+) create mode 100644 tests/unittests/evaluation/test_agent_evaluator.py diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py index c0fc736340..b960bdaa1f 100644 --- a/src/google/adk/evaluation/agent_evaluator.py +++ b/src/google/adk/evaluation/agent_evaluator.py @@ -46,6 +46,7 @@ from .eval_metrics import PrebuiltMetrics from .eval_result import EvalCaseResult from .eval_set import EvalSet +from .eval_set_results_manager import EvalSetResultsManager from .eval_sets_manager import EvalSetsManager from .evaluator import EvalStatus from .in_memory_eval_sets_manager import InMemoryEvalSetsManager @@ -112,6 +113,8 @@ async def evaluate_eval_set( eval_config: Optional[EvalConfig] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, + app_name: Optional[str] = None, + eval_set_results_manager: Optional[EvalSetResultsManager] = None, print_detailed_results: bool = True, ): """Evaluates an agent using the given EvalSet. @@ -128,6 +131,10 @@ async def evaluate_eval_set( assessed. agent_name: The name of the agent, if trying to evaluate something other than root agent. If left empty or none, then root agent is evaluated. + app_name: The application name used by eval set results manager while + persisting eval set results. + eval_set_results_manager: Optional manager used to persist the eval set + evaluation result as `*.evalset_result.json`. print_detailed_results: Whether to print detailed results for each metric evaluation. """ @@ -162,6 +169,13 @@ async def evaluate_eval_set( num_runs=num_runs, user_simulator_provider=user_simulator_provider, ) + AgentEvaluator._maybe_save_eval_set_result( + agent_module=agent_module, + app_name=app_name, + eval_set=eval_set, + eval_results_by_eval_id=eval_results_by_eval_id, + eval_set_results_manager=eval_set_results_manager, + ) # Step 2: Post-process the results! @@ -198,6 +212,8 @@ async def evaluate( eval_dataset_file_path_or_dir: str, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, + app_name: Optional[str] = None, + eval_set_results_manager: Optional[EvalSetResultsManager] = None, initial_session_file: Optional[str] = None, print_detailed_results: bool = True, ): @@ -214,6 +230,10 @@ async def evaluate( num_runs: Number of times all entries in the eval dataset should be assessed. agent_name: The name of the agent. 
+ app_name: The application name used by eval set results manager while + persisting eval set results. + eval_set_results_manager: Optional manager used to persist the eval set + evaluation result as `*.evalset_result.json`. initial_session_file: File that contains initial session state that is needed by all the evals in the eval dataset. print_detailed_results: Whether to print detailed results for each metric @@ -244,6 +264,8 @@ async def evaluate( eval_config=eval_config, num_runs=num_runs, agent_name=agent_name, + app_name=app_name, + eval_set_results_manager=eval_set_results_manager, print_detailed_results=print_detailed_results, ) @@ -644,6 +666,68 @@ def _get_eval_metric_results_with_invocation( ) return eval_metric_results + @staticmethod + def _resolve_app_name( + agent_module: str, app_name: Optional[str] = None + ) -> str: + """Returns app_name for storing eval set results.""" + if app_name: + return app_name + + parts = [part for part in agent_module.split(".") if part] + if not parts: + return agent_module + if len(parts) > 1 and parts[-1] == "agent": + return parts[-2] + return parts[-1] + + @staticmethod + def _flatten_eval_results_by_eval_case_order( + eval_set: EvalSet, + eval_results_by_eval_id: dict[str, list[EvalCaseResult]], + ) -> list[EvalCaseResult]: + """Returns eval results flattened in eval case order.""" + flattened_results: list[EvalCaseResult] = [] + seen_eval_ids = set() + for eval_case in eval_set.eval_cases: + eval_results = eval_results_by_eval_id.get(eval_case.eval_id, []) + if eval_results: + flattened_results.extend(eval_results) + seen_eval_ids.add(eval_case.eval_id) + + for eval_id, eval_results in eval_results_by_eval_id.items(): + if eval_id in seen_eval_ids: + continue + flattened_results.extend(eval_results) + + return flattened_results + + @staticmethod + def _maybe_save_eval_set_result( + agent_module: str, + app_name: Optional[str], + eval_set: EvalSet, + eval_results_by_eval_id: dict[str, list[EvalCaseResult]], + eval_set_results_manager: Optional[EvalSetResultsManager], + ) -> None: + """Saves eval set result if manager is provided.""" + if eval_set_results_manager is None: + return + + resolved_app_name = AgentEvaluator._resolve_app_name( + agent_module=agent_module, app_name=app_name + ) + all_eval_case_results = ( + AgentEvaluator._flatten_eval_results_by_eval_case_order( + eval_set=eval_set, eval_results_by_eval_id=eval_results_by_eval_id + ) + ) + eval_set_results_manager.save_eval_set_result( + app_name=resolved_app_name, + eval_set_id=eval_set.eval_set_id, + eval_case_results=all_eval_case_results, + ) + @staticmethod def _process_metrics_and_get_failures( eval_metric_results: dict[str, list[_EvalMetricResultWithInvocation]], diff --git a/tests/unittests/evaluation/test_agent_evaluator.py b/tests/unittests/evaluation/test_agent_evaluator.py new file mode 100644 index 0000000000..a7f3d4dc83 --- /dev/null +++ b/tests/unittests/evaluation/test_agent_evaluator.py @@ -0,0 +1,235 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import AsyncMock + +from google.adk.evaluation.agent_evaluator import AgentEvaluator +from google.adk.evaluation.eval_config import EvalConfig +from google.adk.evaluation.eval_set_results_manager import EvalSetResultsManager +import pytest + + +@pytest.mark.asyncio +async def test_evaluate_eval_set_saves_results_with_explicit_app_name(mocker): + eval_set = SimpleNamespace( + eval_set_id='eval_set_1', + eval_cases=[ + SimpleNamespace(eval_id='case_b'), + SimpleNamespace(eval_id='case_a'), + ], + ) + result_a = mocker.Mock(name='result_a') + result_b = mocker.Mock(name='result_b') + result_b_2 = mocker.Mock(name='result_b_2') + eval_results_by_eval_id = { + 'case_a': [result_a], + 'case_b': [result_b, result_b_2], + } + + mocker.patch.object( + AgentEvaluator, + '_get_agent_for_eval', + new=AsyncMock(return_value=mocker.Mock()), + ) + mocker.patch( + 'google.adk.evaluation.agent_evaluator.get_eval_metrics_from_config', + return_value=[], + ) + mocker.patch.object( + AgentEvaluator, + '_get_eval_results_by_eval_id', + new=AsyncMock(return_value=eval_results_by_eval_id), + ) + mocker.patch.object( + AgentEvaluator, + '_get_eval_metric_results_with_invocation', + return_value={}, + ) + mocker.patch.object( + AgentEvaluator, + '_process_metrics_and_get_failures', + return_value=[], + ) + + manager = mocker.create_autospec(EvalSetResultsManager, instance=True) + + await AgentEvaluator.evaluate_eval_set( + agent_module='my.pkg.search_agent', + eval_set=eval_set, + eval_config=EvalConfig(criteria={}), + app_name='custom_app', + eval_set_results_manager=manager, + print_detailed_results=False, + ) + + manager.save_eval_set_result.assert_called_once_with( + app_name='custom_app', + eval_set_id='eval_set_1', + eval_case_results=[result_b, result_b_2, result_a], + ) + + +@pytest.mark.asyncio +async def test_evaluate_eval_set_uses_derived_app_name(mocker): + eval_set = SimpleNamespace( + eval_set_id='eval_set_1', + eval_cases=[SimpleNamespace(eval_id='case_a')], + ) + eval_result = mocker.Mock(name='eval_result') + + mocker.patch.object( + AgentEvaluator, + '_get_agent_for_eval', + new=AsyncMock(return_value=mocker.Mock()), + ) + mocker.patch( + 'google.adk.evaluation.agent_evaluator.get_eval_metrics_from_config', + return_value=[], + ) + mocker.patch.object( + AgentEvaluator, + '_get_eval_results_by_eval_id', + new=AsyncMock(return_value={'case_a': [eval_result]}), + ) + mocker.patch.object( + AgentEvaluator, + '_get_eval_metric_results_with_invocation', + return_value={}, + ) + mocker.patch.object( + AgentEvaluator, + '_process_metrics_and_get_failures', + return_value=[], + ) + + manager = mocker.create_autospec(EvalSetResultsManager, instance=True) + + await AgentEvaluator.evaluate_eval_set( + agent_module='pkg.search_agent.agent', + eval_set=eval_set, + eval_config=EvalConfig(criteria={}), + eval_set_results_manager=manager, + print_detailed_results=False, + ) + + manager.save_eval_set_result.assert_called_once_with( + app_name='search_agent', + eval_set_id='eval_set_1', + eval_case_results=[eval_result], + ) + + +@pytest.mark.asyncio +async def test_evaluate_eval_set_saves_before_assert_failure(mocker): + eval_set = SimpleNamespace( + eval_set_id='eval_set_1', + eval_cases=[SimpleNamespace(eval_id='case_a')], + ) + eval_result = mocker.Mock(name='eval_result') + + 
mocker.patch.object( + AgentEvaluator, + '_get_agent_for_eval', + new=AsyncMock(return_value=mocker.Mock()), + ) + mocker.patch( + 'google.adk.evaluation.agent_evaluator.get_eval_metrics_from_config', + return_value=[], + ) + mocker.patch.object( + AgentEvaluator, + '_get_eval_results_by_eval_id', + new=AsyncMock(return_value={'case_a': [eval_result]}), + ) + mocker.patch.object( + AgentEvaluator, + '_get_eval_metric_results_with_invocation', + return_value={}, + ) + mocker.patch.object( + AgentEvaluator, + '_process_metrics_and_get_failures', + return_value=['failed'], + ) + + manager = mocker.create_autospec(EvalSetResultsManager, instance=True) + + with pytest.raises(AssertionError): + await AgentEvaluator.evaluate_eval_set( + agent_module='pkg.search_agent', + eval_set=eval_set, + eval_config=EvalConfig(criteria={}), + eval_set_results_manager=manager, + print_detailed_results=False, + ) + + manager.save_eval_set_result.assert_called_once_with( + app_name='search_agent', + eval_set_id='eval_set_1', + eval_case_results=[eval_result], + ) + + +@pytest.mark.asyncio +async def test_evaluate_passes_results_manager_and_app_name(mocker, tmp_path): + test_dir = tmp_path / 'evals' + nested_dir = test_dir / 'nested' + nested_dir.mkdir(parents=True) + + test_file_1 = test_dir / 'a.test.json' + test_file_2 = nested_dir / 'b.test.json' + test_file_1.write_text('[]', encoding='utf-8') + test_file_2.write_text('[]', encoding='utf-8') + + eval_config = EvalConfig(criteria={}) + eval_set = SimpleNamespace(eval_set_id='eval_set_1') + + mocker.patch.object( + AgentEvaluator, 'find_config_for_test_file', return_value=eval_config + ) + mocker.patch.object( + AgentEvaluator, + '_load_eval_set_from_file', + return_value=eval_set, + ) + evaluate_eval_set_mock = mocker.patch.object( + AgentEvaluator, + 'evaluate_eval_set', + new=AsyncMock(), + ) + + manager = mocker.create_autospec(EvalSetResultsManager, instance=True) + + await AgentEvaluator.evaluate( + agent_module='pkg.search_agent', + eval_dataset_file_path_or_dir=str(test_dir), + app_name='custom_app', + eval_set_results_manager=manager, + print_detailed_results=False, + ) + + assert evaluate_eval_set_mock.await_count == 2 + for await_call in evaluate_eval_set_mock.await_args_list: + assert await_call.kwargs['app_name'] == 'custom_app' + assert await_call.kwargs['eval_set_results_manager'] is manager + + called_paths = { + Path(call.args[0]) + for call in AgentEvaluator.find_config_for_test_file.call_args_list + } + assert called_paths == {test_file_1, test_file_2} From ee3a4549c4ce808074944dd32ed3bbfa601f3347 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sun, 8 Feb 2026 16:10:30 +0900 Subject: [PATCH 2/5] test(integration): add AgentEvaluator persistence example Add an integration test that demonstrates how to persist eval set results from AgentEvaluator.evaluate() without explicitly passing app_name. - Use LocalEvalSetResultsManager with pytest tmp_path as agents_dir. - Call AgentEvaluator.evaluate() with eval_set_results_manager only. - Verify that an eval set result file is created under: /home_automation_agent/.adk/eval_history/*.evalset_result.json This serves as a usage example and verifies derived app_name behavior in an end-to-end evaluation flow. 
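
For reference, a minimal usage sketch of the flow this test exercises
(the agents_dir value below is illustrative, not introduced by this change;
the module and dataset paths are the existing integration fixtures):

    from google.adk.evaluation.agent_evaluator import AgentEvaluator
    from google.adk.evaluation.local_eval_set_results_manager import LocalEvalSetResultsManager

    # Results are persisted under <agents_dir>/<app_name>/.adk/eval_history/.
    results_manager = LocalEvalSetResultsManager(agents_dir="/tmp/agents")

    # app_name is omitted, so it is derived from agent_module
    # ("...fixture.home_automation_agent" -> "home_automation_agent").
    await AgentEvaluator.evaluate(
        agent_module="tests.integration.fixture.home_automation_agent",
        eval_dataset_file_path_or_dir=(
            "tests/integration/fixture/home_automation_agent/simple_test.test.json"
        ),
        eval_set_results_manager=results_manager,
    )

Since evaluate() is a coroutine, the call above must run inside an async
context, e.g. a pytest.mark.asyncio test such as the one added here.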
--- tests/integration/test_with_test_file.py | 25 ++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/integration/test_with_test_file.py b/tests/integration/test_with_test_file.py index eed2a2d732..22ca98ffaa 100644 --- a/tests/integration/test_with_test_file.py +++ b/tests/integration/test_with_test_file.py @@ -13,6 +13,7 @@ # limitations under the License. from google.adk.evaluation.agent_evaluator import AgentEvaluator +from google.adk.evaluation.local_eval_set_results_manager import LocalEvalSetResultsManager import pytest @@ -35,3 +36,27 @@ async def test_with_folder_of_test_files_long_running(): ), num_runs=4, ) + + +@pytest.mark.asyncio +async def test_with_single_test_file_saves_eval_set_result( + tmp_path, +): + """Persists eval set results with derived app_name when app_name is omitted.""" + eval_set_results_manager = LocalEvalSetResultsManager( + agents_dir=str(tmp_path) + ) + await AgentEvaluator.evaluate( + agent_module="tests.integration.fixture.home_automation_agent", + eval_dataset_file_path_or_dir=( + "tests/integration/fixture/home_automation_agent/simple_test.test.json" + ), + eval_set_results_manager=eval_set_results_manager, + ) + + saved_result_files = list( + (tmp_path / "home_automation_agent" / ".adk" / "eval_history").glob( + "*.evalset_result.json" + ) + ) + assert saved_result_files From e5f905420e14a992b0c29f42c3bb4cf6e4c5f828 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sun, 8 Feb 2026 16:18:30 +0900 Subject: [PATCH 3/5] fix(evaluation): preserve AgentEvaluator positional-arg compatibility Move newly added optional parameters (`app_name`, `eval_set_results_manager`) to the end of public AgentEvaluator method signatures to avoid breaking existing positional-argument callers. - Keep backward compatibility for: - AgentEvaluator.evaluate_eval_set(...) - AgentEvaluator.evaluate(...) - Add regression tests covering positional argument behavior for: - `print_detailed_results` in evaluate_eval_set - `initial_session_file` and `print_detailed_results` in evaluate --- src/google/adk/evaluation/agent_evaluator.py | 6 +- .../evaluation/test_agent_evaluator.py | 90 +++++++++++++++++++ 2 files changed, 93 insertions(+), 3 deletions(-) diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py index b960bdaa1f..687ef08ded 100644 --- a/src/google/adk/evaluation/agent_evaluator.py +++ b/src/google/adk/evaluation/agent_evaluator.py @@ -113,9 +113,9 @@ async def evaluate_eval_set( eval_config: Optional[EvalConfig] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, + print_detailed_results: bool = True, app_name: Optional[str] = None, eval_set_results_manager: Optional[EvalSetResultsManager] = None, - print_detailed_results: bool = True, ): """Evaluates an agent using the given EvalSet. @@ -212,10 +212,10 @@ async def evaluate( eval_dataset_file_path_or_dir: str, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, - app_name: Optional[str] = None, - eval_set_results_manager: Optional[EvalSetResultsManager] = None, initial_session_file: Optional[str] = None, print_detailed_results: bool = True, + app_name: Optional[str] = None, + eval_set_results_manager: Optional[EvalSetResultsManager] = None, ): """Evaluates an Agent given eval data. 
diff --git a/tests/unittests/evaluation/test_agent_evaluator.py b/tests/unittests/evaluation/test_agent_evaluator.py index a7f3d4dc83..387a1d0791 100644 --- a/tests/unittests/evaluation/test_agent_evaluator.py +++ b/tests/unittests/evaluation/test_agent_evaluator.py @@ -233,3 +233,93 @@ async def test_evaluate_passes_results_manager_and_app_name(mocker, tmp_path): for call in AgentEvaluator.find_config_for_test_file.call_args_list } assert called_paths == {test_file_1, test_file_2} + + +@pytest.mark.asyncio +async def test_evaluate_eval_set_keeps_positional_print_detailed_results( + mocker, +): + eval_set = SimpleNamespace( + eval_set_id='eval_set_1', + eval_cases=[SimpleNamespace(eval_id='case_a')], + ) + eval_result = mocker.Mock(name='eval_result') + + mocker.patch.object( + AgentEvaluator, + '_get_agent_for_eval', + new=AsyncMock(return_value=mocker.Mock()), + ) + mocker.patch( + 'google.adk.evaluation.agent_evaluator.get_eval_metrics_from_config', + return_value=[], + ) + mocker.patch.object( + AgentEvaluator, + '_get_eval_results_by_eval_id', + new=AsyncMock(return_value={'case_a': [eval_result]}), + ) + mocker.patch.object( + AgentEvaluator, + '_get_eval_metric_results_with_invocation', + return_value={}, + ) + process_mock = mocker.patch.object( + AgentEvaluator, + '_process_metrics_and_get_failures', + return_value=[], + ) + + await AgentEvaluator.evaluate_eval_set( + 'pkg.search_agent', + eval_set, + None, + EvalConfig(criteria={}), + 1, + None, + False, + ) + + assert process_mock.call_args.kwargs['print_detailed_results'] is False + + +@pytest.mark.asyncio +async def test_evaluate_keeps_positional_initial_session_file_and_print_flag( + mocker, +): + initial_session_mock = mocker.patch.object( + AgentEvaluator, + '_get_initial_session', + return_value={}, + ) + mocker.patch.object( + AgentEvaluator, + 'find_config_for_test_file', + return_value=EvalConfig(criteria={}), + ) + mocker.patch.object( + AgentEvaluator, + '_load_eval_set_from_file', + return_value=SimpleNamespace(eval_set_id='eval_set_1'), + ) + evaluate_eval_set_mock = mocker.patch.object( + AgentEvaluator, + 'evaluate_eval_set', + new=AsyncMock(), + ) + + await AgentEvaluator.evaluate( + 'pkg.search_agent', + 'some.test.json', + 1, + None, + 'initial.session.json', + False, + ) + + initial_session_mock.assert_called_once_with('initial.session.json') + evaluate_eval_set_mock.assert_awaited_once() + assert ( + evaluate_eval_set_mock.await_args.kwargs['print_detailed_results'] + is False + ) From ceec65ba7171eed3c053e2c3f07bb86d6615ae40 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sun, 8 Feb 2026 16:28:16 +0900 Subject: [PATCH 4/5] fix(evaluation): sort remaining eval ids for deterministic result order --- src/google/adk/evaluation/agent_evaluator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py index 687ef08ded..196e2c54af 100644 --- a/src/google/adk/evaluation/agent_evaluator.py +++ b/src/google/adk/evaluation/agent_evaluator.py @@ -695,10 +695,10 @@ def _flatten_eval_results_by_eval_case_order( flattened_results.extend(eval_results) seen_eval_ids.add(eval_case.eval_id) - for eval_id, eval_results in eval_results_by_eval_id.items(): - if eval_id in seen_eval_ids: - continue - flattened_results.extend(eval_results) + # Sort remaining eval ids for deterministic output across runs. 
+ remaining_eval_ids = sorted(eval_results_by_eval_id.keys() - seen_eval_ids) + for eval_id in remaining_eval_ids: + flattened_results.extend(eval_results_by_eval_id[eval_id]) return flattened_results From 23246b14db740c28756fb6fdcebaa849b43374be Mon Sep 17 00:00:00 2001 From: ftnext Date: Sun, 8 Feb 2026 17:19:41 +0900 Subject: [PATCH 5/5] feat(evaluation): change AgentEvaluator persistence from aggregated to per-case as `adk eval` and Web UI/API --- src/google/adk/evaluation/agent_evaluator.py | 50 ++++++++----------- tests/integration/test_with_test_file.py | 3 +- .../evaluation/test_agent_evaluator.py | 23 +++++++-- 3 files changed, 40 insertions(+), 36 deletions(-) diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py index 196e2c54af..de5d49e68a 100644 --- a/src/google/adk/evaluation/agent_evaluator.py +++ b/src/google/adk/evaluation/agent_evaluator.py @@ -681,27 +681,6 @@ def _resolve_app_name( return parts[-2] return parts[-1] - @staticmethod - def _flatten_eval_results_by_eval_case_order( - eval_set: EvalSet, - eval_results_by_eval_id: dict[str, list[EvalCaseResult]], - ) -> list[EvalCaseResult]: - """Returns eval results flattened in eval case order.""" - flattened_results: list[EvalCaseResult] = [] - seen_eval_ids = set() - for eval_case in eval_set.eval_cases: - eval_results = eval_results_by_eval_id.get(eval_case.eval_id, []) - if eval_results: - flattened_results.extend(eval_results) - seen_eval_ids.add(eval_case.eval_id) - - # Sort remaining eval ids for deterministic output across runs. - remaining_eval_ids = sorted(eval_results_by_eval_id.keys() - seen_eval_ids) - for eval_id in remaining_eval_ids: - flattened_results.extend(eval_results_by_eval_id[eval_id]) - - return flattened_results - @staticmethod def _maybe_save_eval_set_result( agent_module: str, @@ -717,16 +696,27 @@ def _maybe_save_eval_set_result( resolved_app_name = AgentEvaluator._resolve_app_name( agent_module=agent_module, app_name=app_name ) - all_eval_case_results = ( - AgentEvaluator._flatten_eval_results_by_eval_case_order( - eval_set=eval_set, eval_results_by_eval_id=eval_results_by_eval_id + seen_eval_ids = set() + for eval_case in eval_set.eval_cases: + eval_results = eval_results_by_eval_id.get(eval_case.eval_id, []) + for eval_case_result in eval_results: + eval_set_results_manager.save_eval_set_result( + app_name=resolved_app_name, + eval_set_id=eval_set.eval_set_id, + eval_case_results=[eval_case_result], + ) + if eval_results: + seen_eval_ids.add(eval_case.eval_id) + + # Save any remaining eval ids in sorted order for deterministic output. 
+ remaining_eval_ids = sorted(eval_results_by_eval_id.keys() - seen_eval_ids) + for eval_id in remaining_eval_ids: + for eval_case_result in eval_results_by_eval_id[eval_id]: + eval_set_results_manager.save_eval_set_result( + app_name=resolved_app_name, + eval_set_id=eval_set.eval_set_id, + eval_case_results=[eval_case_result], ) - ) - eval_set_results_manager.save_eval_set_result( - app_name=resolved_app_name, - eval_set_id=eval_set.eval_set_id, - eval_case_results=all_eval_case_results, - ) @staticmethod def _process_metrics_and_get_failures( diff --git a/tests/integration/test_with_test_file.py b/tests/integration/test_with_test_file.py index 22ca98ffaa..714a01be91 100644 --- a/tests/integration/test_with_test_file.py +++ b/tests/integration/test_with_test_file.py @@ -51,6 +51,7 @@ async def test_with_single_test_file_saves_eval_set_result( eval_dataset_file_path_or_dir=( "tests/integration/fixture/home_automation_agent/simple_test.test.json" ), + num_runs=2, eval_set_results_manager=eval_set_results_manager, ) @@ -59,4 +60,4 @@ async def test_with_single_test_file_saves_eval_set_result( "*.evalset_result.json" ) ) - assert saved_result_files + assert len(saved_result_files) == 2 diff --git a/tests/unittests/evaluation/test_agent_evaluator.py b/tests/unittests/evaluation/test_agent_evaluator.py index 387a1d0791..dabf010eeb 100644 --- a/tests/unittests/evaluation/test_agent_evaluator.py +++ b/tests/unittests/evaluation/test_agent_evaluator.py @@ -77,11 +77,24 @@ async def test_evaluate_eval_set_saves_results_with_explicit_app_name(mocker): print_detailed_results=False, ) - manager.save_eval_set_result.assert_called_once_with( - app_name='custom_app', - eval_set_id='eval_set_1', - eval_case_results=[result_b, result_b_2, result_a], - ) + assert manager.save_eval_set_result.call_count == 3 + assert manager.save_eval_set_result.call_args_list == [ + mocker.call( + app_name='custom_app', + eval_set_id='eval_set_1', + eval_case_results=[result_b], + ), + mocker.call( + app_name='custom_app', + eval_set_id='eval_set_1', + eval_case_results=[result_b_2], + ), + mocker.call( + app_name='custom_app', + eval_set_id='eval_set_1', + eval_case_results=[result_a], + ), + ] @pytest.mark.asyncio