From 87db8274b4e181a80c1da07c0a5fb63e0325a588 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sun, 8 Feb 2026 16:03:22 +0900 Subject: [PATCH 1/5] feat(evaluation): add optional eval set result persistence to AgentEvaluator Add optional eval result persistence to AgentEvaluator to align programmatic evaluation with existing EvalSetResultsManager workflows. - Extend AgentEvaluator.evaluate() with: - app_name: Optional[str] = None - eval_set_results_manager: Optional[EvalSetResultsManager] = None - Extend AgentEvaluator.evaluate_eval_set() with the same optional parameters. - Persist aggregated EvalCaseResult entries per eval set when a results manager is provided. - Save results before failure assertion so failed runs still leave artifacts. - Add app name resolution logic (explicit app_name first, then derive from agent_module, including ".agent" suffix handling). - Add unit tests covering explicit/derived app_name, save-on-failure behavior, and argument propagation from evaluate() to evaluate_eval_set(). --- src/google/adk/evaluation/agent_evaluator.py | 84 +++++++ .../evaluation/test_agent_evaluator.py | 235 ++++++++++++++++++ 2 files changed, 319 insertions(+) create mode 100644 tests/unittests/evaluation/test_agent_evaluator.py diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py index c0fc736340..b960bdaa1f 100644 --- a/src/google/adk/evaluation/agent_evaluator.py +++ b/src/google/adk/evaluation/agent_evaluator.py @@ -46,6 +46,7 @@ from .eval_metrics import PrebuiltMetrics from .eval_result import EvalCaseResult from .eval_set import EvalSet +from .eval_set_results_manager import EvalSetResultsManager from .eval_sets_manager import EvalSetsManager from .evaluator import EvalStatus from .in_memory_eval_sets_manager import InMemoryEvalSetsManager @@ -112,6 +113,8 @@ async def evaluate_eval_set( eval_config: Optional[EvalConfig] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, + app_name: Optional[str] = None, + eval_set_results_manager: Optional[EvalSetResultsManager] = None, print_detailed_results: bool = True, ): """Evaluates an agent using the given EvalSet. @@ -128,6 +131,10 @@ async def evaluate_eval_set( assessed. agent_name: The name of the agent, if trying to evaluate something other than root agent. If left empty or none, then root agent is evaluated. + app_name: The application name used by eval set results manager while + persisting eval set results. + eval_set_results_manager: Optional manager used to persist the eval set + evaluation result as `*.evalset_result.json`. print_detailed_results: Whether to print detailed results for each metric evaluation. """ @@ -162,6 +169,13 @@ async def evaluate_eval_set( num_runs=num_runs, user_simulator_provider=user_simulator_provider, ) + AgentEvaluator._maybe_save_eval_set_result( + agent_module=agent_module, + app_name=app_name, + eval_set=eval_set, + eval_results_by_eval_id=eval_results_by_eval_id, + eval_set_results_manager=eval_set_results_manager, + ) # Step 2: Post-process the results! @@ -198,6 +212,8 @@ async def evaluate( eval_dataset_file_path_or_dir: str, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, + app_name: Optional[str] = None, + eval_set_results_manager: Optional[EvalSetResultsManager] = None, initial_session_file: Optional[str] = None, print_detailed_results: bool = True, ): @@ -214,6 +230,10 @@ async def evaluate( num_runs: Number of times all entries in the eval dataset should be assessed. agent_name: The name of the agent. 
+ app_name: The application name used by eval set results manager while + persisting eval set results. + eval_set_results_manager: Optional manager used to persist the eval set + evaluation result as `*.evalset_result.json`. initial_session_file: File that contains initial session state that is needed by all the evals in the eval dataset. print_detailed_results: Whether to print detailed results for each metric @@ -244,6 +264,8 @@ async def evaluate( eval_config=eval_config, num_runs=num_runs, agent_name=agent_name, + app_name=app_name, + eval_set_results_manager=eval_set_results_manager, print_detailed_results=print_detailed_results, ) @@ -644,6 +666,68 @@ def _get_eval_metric_results_with_invocation( ) return eval_metric_results + @staticmethod + def _resolve_app_name( + agent_module: str, app_name: Optional[str] = None + ) -> str: + """Returns app_name for storing eval set results.""" + if app_name: + return app_name + + parts = [part for part in agent_module.split(".") if part] + if not parts: + return agent_module + if len(parts) > 1 and parts[-1] == "agent": + return parts[-2] + return parts[-1] + + @staticmethod + def _flatten_eval_results_by_eval_case_order( + eval_set: EvalSet, + eval_results_by_eval_id: dict[str, list[EvalCaseResult]], + ) -> list[EvalCaseResult]: + """Returns eval results flattened in eval case order.""" + flattened_results: list[EvalCaseResult] = [] + seen_eval_ids = set() + for eval_case in eval_set.eval_cases: + eval_results = eval_results_by_eval_id.get(eval_case.eval_id, []) + if eval_results: + flattened_results.extend(eval_results) + seen_eval_ids.add(eval_case.eval_id) + + for eval_id, eval_results in eval_results_by_eval_id.items(): + if eval_id in seen_eval_ids: + continue + flattened_results.extend(eval_results) + + return flattened_results + + @staticmethod + def _maybe_save_eval_set_result( + agent_module: str, + app_name: Optional[str], + eval_set: EvalSet, + eval_results_by_eval_id: dict[str, list[EvalCaseResult]], + eval_set_results_manager: Optional[EvalSetResultsManager], + ) -> None: + """Saves eval set result if manager is provided.""" + if eval_set_results_manager is None: + return + + resolved_app_name = AgentEvaluator._resolve_app_name( + agent_module=agent_module, app_name=app_name + ) + all_eval_case_results = ( + AgentEvaluator._flatten_eval_results_by_eval_case_order( + eval_set=eval_set, eval_results_by_eval_id=eval_results_by_eval_id + ) + ) + eval_set_results_manager.save_eval_set_result( + app_name=resolved_app_name, + eval_set_id=eval_set.eval_set_id, + eval_case_results=all_eval_case_results, + ) + @staticmethod def _process_metrics_and_get_failures( eval_metric_results: dict[str, list[_EvalMetricResultWithInvocation]], diff --git a/tests/unittests/evaluation/test_agent_evaluator.py b/tests/unittests/evaluation/test_agent_evaluator.py new file mode 100644 index 0000000000..a7f3d4dc83 --- /dev/null +++ b/tests/unittests/evaluation/test_agent_evaluator.py @@ -0,0 +1,235 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace +from unittest.mock import AsyncMock + +from google.adk.evaluation.agent_evaluator import AgentEvaluator +from google.adk.evaluation.eval_config import EvalConfig +from google.adk.evaluation.eval_set_results_manager import EvalSetResultsManager +import pytest + + +@pytest.mark.asyncio +async def test_evaluate_eval_set_saves_results_with_explicit_app_name(mocker): + eval_set = SimpleNamespace( + eval_set_id='eval_set_1', + eval_cases=[ + SimpleNamespace(eval_id='case_b'), + SimpleNamespace(eval_id='case_a'), + ], + ) + result_a = mocker.Mock(name='result_a') + result_b = mocker.Mock(name='result_b') + result_b_2 = mocker.Mock(name='result_b_2') + eval_results_by_eval_id = { + 'case_a': [result_a], + 'case_b': [result_b, result_b_2], + } + + mocker.patch.object( + AgentEvaluator, + '_get_agent_for_eval', + new=AsyncMock(return_value=mocker.Mock()), + ) + mocker.patch( + 'google.adk.evaluation.agent_evaluator.get_eval_metrics_from_config', + return_value=[], + ) + mocker.patch.object( + AgentEvaluator, + '_get_eval_results_by_eval_id', + new=AsyncMock(return_value=eval_results_by_eval_id), + ) + mocker.patch.object( + AgentEvaluator, + '_get_eval_metric_results_with_invocation', + return_value={}, + ) + mocker.patch.object( + AgentEvaluator, + '_process_metrics_and_get_failures', + return_value=[], + ) + + manager = mocker.create_autospec(EvalSetResultsManager, instance=True) + + await AgentEvaluator.evaluate_eval_set( + agent_module='my.pkg.search_agent', + eval_set=eval_set, + eval_config=EvalConfig(criteria={}), + app_name='custom_app', + eval_set_results_manager=manager, + print_detailed_results=False, + ) + + manager.save_eval_set_result.assert_called_once_with( + app_name='custom_app', + eval_set_id='eval_set_1', + eval_case_results=[result_b, result_b_2, result_a], + ) + + +@pytest.mark.asyncio +async def test_evaluate_eval_set_uses_derived_app_name(mocker): + eval_set = SimpleNamespace( + eval_set_id='eval_set_1', + eval_cases=[SimpleNamespace(eval_id='case_a')], + ) + eval_result = mocker.Mock(name='eval_result') + + mocker.patch.object( + AgentEvaluator, + '_get_agent_for_eval', + new=AsyncMock(return_value=mocker.Mock()), + ) + mocker.patch( + 'google.adk.evaluation.agent_evaluator.get_eval_metrics_from_config', + return_value=[], + ) + mocker.patch.object( + AgentEvaluator, + '_get_eval_results_by_eval_id', + new=AsyncMock(return_value={'case_a': [eval_result]}), + ) + mocker.patch.object( + AgentEvaluator, + '_get_eval_metric_results_with_invocation', + return_value={}, + ) + mocker.patch.object( + AgentEvaluator, + '_process_metrics_and_get_failures', + return_value=[], + ) + + manager = mocker.create_autospec(EvalSetResultsManager, instance=True) + + await AgentEvaluator.evaluate_eval_set( + agent_module='pkg.search_agent.agent', + eval_set=eval_set, + eval_config=EvalConfig(criteria={}), + eval_set_results_manager=manager, + print_detailed_results=False, + ) + + manager.save_eval_set_result.assert_called_once_with( + app_name='search_agent', + eval_set_id='eval_set_1', + eval_case_results=[eval_result], + ) + + +@pytest.mark.asyncio +async def test_evaluate_eval_set_saves_before_assert_failure(mocker): + eval_set = SimpleNamespace( + eval_set_id='eval_set_1', + eval_cases=[SimpleNamespace(eval_id='case_a')], + ) + eval_result = mocker.Mock(name='eval_result') + + 
mocker.patch.object( + AgentEvaluator, + '_get_agent_for_eval', + new=AsyncMock(return_value=mocker.Mock()), + ) + mocker.patch( + 'google.adk.evaluation.agent_evaluator.get_eval_metrics_from_config', + return_value=[], + ) + mocker.patch.object( + AgentEvaluator, + '_get_eval_results_by_eval_id', + new=AsyncMock(return_value={'case_a': [eval_result]}), + ) + mocker.patch.object( + AgentEvaluator, + '_get_eval_metric_results_with_invocation', + return_value={}, + ) + mocker.patch.object( + AgentEvaluator, + '_process_metrics_and_get_failures', + return_value=['failed'], + ) + + manager = mocker.create_autospec(EvalSetResultsManager, instance=True) + + with pytest.raises(AssertionError): + await AgentEvaluator.evaluate_eval_set( + agent_module='pkg.search_agent', + eval_set=eval_set, + eval_config=EvalConfig(criteria={}), + eval_set_results_manager=manager, + print_detailed_results=False, + ) + + manager.save_eval_set_result.assert_called_once_with( + app_name='search_agent', + eval_set_id='eval_set_1', + eval_case_results=[eval_result], + ) + + +@pytest.mark.asyncio +async def test_evaluate_passes_results_manager_and_app_name(mocker, tmp_path): + test_dir = tmp_path / 'evals' + nested_dir = test_dir / 'nested' + nested_dir.mkdir(parents=True) + + test_file_1 = test_dir / 'a.test.json' + test_file_2 = nested_dir / 'b.test.json' + test_file_1.write_text('[]', encoding='utf-8') + test_file_2.write_text('[]', encoding='utf-8') + + eval_config = EvalConfig(criteria={}) + eval_set = SimpleNamespace(eval_set_id='eval_set_1') + + mocker.patch.object( + AgentEvaluator, 'find_config_for_test_file', return_value=eval_config + ) + mocker.patch.object( + AgentEvaluator, + '_load_eval_set_from_file', + return_value=eval_set, + ) + evaluate_eval_set_mock = mocker.patch.object( + AgentEvaluator, + 'evaluate_eval_set', + new=AsyncMock(), + ) + + manager = mocker.create_autospec(EvalSetResultsManager, instance=True) + + await AgentEvaluator.evaluate( + agent_module='pkg.search_agent', + eval_dataset_file_path_or_dir=str(test_dir), + app_name='custom_app', + eval_set_results_manager=manager, + print_detailed_results=False, + ) + + assert evaluate_eval_set_mock.await_count == 2 + for await_call in evaluate_eval_set_mock.await_args_list: + assert await_call.kwargs['app_name'] == 'custom_app' + assert await_call.kwargs['eval_set_results_manager'] is manager + + called_paths = { + Path(call.args[0]) + for call in AgentEvaluator.find_config_for_test_file.call_args_list + } + assert called_paths == {test_file_1, test_file_2} From ee3a4549c4ce808074944dd32ed3bbfa601f3347 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sun, 8 Feb 2026 16:10:30 +0900 Subject: [PATCH 2/5] test(integration): add AgentEvaluator persistence example Add an integration test that demonstrates how to persist eval set results from AgentEvaluator.evaluate() without explicitly passing app_name. - Use LocalEvalSetResultsManager with pytest tmp_path as agents_dir. - Call AgentEvaluator.evaluate() with eval_set_results_manager only. - Verify that an eval set result file is created under: /home_automation_agent/.adk/eval_history/*.evalset_result.json This serves as a usage example and verifies derived app_name behavior in an end-to-end evaluation flow. 
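
For reference, a minimal usage sketch of the flow this test exercises
(the agents_dir value below is illustrative, not introduced by this change;
the module and dataset paths are the existing integration fixtures):

    from google.adk.evaluation.agent_evaluator import AgentEvaluator
    from google.adk.evaluation.local_eval_set_results_manager import LocalEvalSetResultsManager

    # Results are persisted under <agents_dir>/<app_name>/.adk/eval_history/.
    results_manager = LocalEvalSetResultsManager(agents_dir="/tmp/agents")

    # app_name is omitted, so it is derived from agent_module
    # ("...fixture.home_automation_agent" -> "home_automation_agent").
    await AgentEvaluator.evaluate(
        agent_module="tests.integration.fixture.home_automation_agent",
        eval_dataset_file_path_or_dir=(
            "tests/integration/fixture/home_automation_agent/simple_test.test.json"
        ),
        eval_set_results_manager=results_manager,
    )

Since evaluate() is a coroutine, the call above must run inside an async
context, e.g. a pytest.mark.asyncio test such as the one added here.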
--- tests/integration/test_with_test_file.py | 25 ++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tests/integration/test_with_test_file.py b/tests/integration/test_with_test_file.py index eed2a2d732..22ca98ffaa 100644 --- a/tests/integration/test_with_test_file.py +++ b/tests/integration/test_with_test_file.py @@ -13,6 +13,7 @@ # limitations under the License. from google.adk.evaluation.agent_evaluator import AgentEvaluator +from google.adk.evaluation.local_eval_set_results_manager import LocalEvalSetResultsManager import pytest @@ -35,3 +36,27 @@ async def test_with_folder_of_test_files_long_running(): ), num_runs=4, ) + + +@pytest.mark.asyncio +async def test_with_single_test_file_saves_eval_set_result( + tmp_path, +): + """Persists eval set results with derived app_name when app_name is omitted.""" + eval_set_results_manager = LocalEvalSetResultsManager( + agents_dir=str(tmp_path) + ) + await AgentEvaluator.evaluate( + agent_module="tests.integration.fixture.home_automation_agent", + eval_dataset_file_path_or_dir=( + "tests/integration/fixture/home_automation_agent/simple_test.test.json" + ), + eval_set_results_manager=eval_set_results_manager, + ) + + saved_result_files = list( + (tmp_path / "home_automation_agent" / ".adk" / "eval_history").glob( + "*.evalset_result.json" + ) + ) + assert saved_result_files From e5f905420e14a992b0c29f42c3bb4cf6e4c5f828 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sun, 8 Feb 2026 16:18:30 +0900 Subject: [PATCH 3/5] fix(evaluation): preserve AgentEvaluator positional-arg compatibility Move newly added optional parameters (`app_name`, `eval_set_results_manager`) to the end of public AgentEvaluator method signatures to avoid breaking existing positional-argument callers. - Keep backward compatibility for: - AgentEvaluator.evaluate_eval_set(...) - AgentEvaluator.evaluate(...) - Add regression tests covering positional argument behavior for: - `print_detailed_results` in evaluate_eval_set - `initial_session_file` and `print_detailed_results` in evaluate --- src/google/adk/evaluation/agent_evaluator.py | 6 +- .../evaluation/test_agent_evaluator.py | 90 +++++++++++++++++++ 2 files changed, 93 insertions(+), 3 deletions(-) diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py index b960bdaa1f..687ef08ded 100644 --- a/src/google/adk/evaluation/agent_evaluator.py +++ b/src/google/adk/evaluation/agent_evaluator.py @@ -113,9 +113,9 @@ async def evaluate_eval_set( eval_config: Optional[EvalConfig] = None, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, + print_detailed_results: bool = True, app_name: Optional[str] = None, eval_set_results_manager: Optional[EvalSetResultsManager] = None, - print_detailed_results: bool = True, ): """Evaluates an agent using the given EvalSet. @@ -212,10 +212,10 @@ async def evaluate( eval_dataset_file_path_or_dir: str, num_runs: int = NUM_RUNS, agent_name: Optional[str] = None, - app_name: Optional[str] = None, - eval_set_results_manager: Optional[EvalSetResultsManager] = None, initial_session_file: Optional[str] = None, print_detailed_results: bool = True, + app_name: Optional[str] = None, + eval_set_results_manager: Optional[EvalSetResultsManager] = None, ): """Evaluates an Agent given eval data. 
diff --git a/tests/unittests/evaluation/test_agent_evaluator.py b/tests/unittests/evaluation/test_agent_evaluator.py index a7f3d4dc83..387a1d0791 100644 --- a/tests/unittests/evaluation/test_agent_evaluator.py +++ b/tests/unittests/evaluation/test_agent_evaluator.py @@ -233,3 +233,93 @@ async def test_evaluate_passes_results_manager_and_app_name(mocker, tmp_path): for call in AgentEvaluator.find_config_for_test_file.call_args_list } assert called_paths == {test_file_1, test_file_2} + + +@pytest.mark.asyncio +async def test_evaluate_eval_set_keeps_positional_print_detailed_results( + mocker, +): + eval_set = SimpleNamespace( + eval_set_id='eval_set_1', + eval_cases=[SimpleNamespace(eval_id='case_a')], + ) + eval_result = mocker.Mock(name='eval_result') + + mocker.patch.object( + AgentEvaluator, + '_get_agent_for_eval', + new=AsyncMock(return_value=mocker.Mock()), + ) + mocker.patch( + 'google.adk.evaluation.agent_evaluator.get_eval_metrics_from_config', + return_value=[], + ) + mocker.patch.object( + AgentEvaluator, + '_get_eval_results_by_eval_id', + new=AsyncMock(return_value={'case_a': [eval_result]}), + ) + mocker.patch.object( + AgentEvaluator, + '_get_eval_metric_results_with_invocation', + return_value={}, + ) + process_mock = mocker.patch.object( + AgentEvaluator, + '_process_metrics_and_get_failures', + return_value=[], + ) + + await AgentEvaluator.evaluate_eval_set( + 'pkg.search_agent', + eval_set, + None, + EvalConfig(criteria={}), + 1, + None, + False, + ) + + assert process_mock.call_args.kwargs['print_detailed_results'] is False + + +@pytest.mark.asyncio +async def test_evaluate_keeps_positional_initial_session_file_and_print_flag( + mocker, +): + initial_session_mock = mocker.patch.object( + AgentEvaluator, + '_get_initial_session', + return_value={}, + ) + mocker.patch.object( + AgentEvaluator, + 'find_config_for_test_file', + return_value=EvalConfig(criteria={}), + ) + mocker.patch.object( + AgentEvaluator, + '_load_eval_set_from_file', + return_value=SimpleNamespace(eval_set_id='eval_set_1'), + ) + evaluate_eval_set_mock = mocker.patch.object( + AgentEvaluator, + 'evaluate_eval_set', + new=AsyncMock(), + ) + + await AgentEvaluator.evaluate( + 'pkg.search_agent', + 'some.test.json', + 1, + None, + 'initial.session.json', + False, + ) + + initial_session_mock.assert_called_once_with('initial.session.json') + evaluate_eval_set_mock.assert_awaited_once() + assert ( + evaluate_eval_set_mock.await_args.kwargs['print_detailed_results'] + is False + ) From ceec65ba7171eed3c053e2c3f07bb86d6615ae40 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sun, 8 Feb 2026 16:28:16 +0900 Subject: [PATCH 4/5] fix(evaluation): sort remaining eval ids for deterministic result order --- src/google/adk/evaluation/agent_evaluator.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py index 687ef08ded..196e2c54af 100644 --- a/src/google/adk/evaluation/agent_evaluator.py +++ b/src/google/adk/evaluation/agent_evaluator.py @@ -695,10 +695,10 @@ def _flatten_eval_results_by_eval_case_order( flattened_results.extend(eval_results) seen_eval_ids.add(eval_case.eval_id) - for eval_id, eval_results in eval_results_by_eval_id.items(): - if eval_id in seen_eval_ids: - continue - flattened_results.extend(eval_results) + # Sort remaining eval ids for deterministic output across runs. 
+ remaining_eval_ids = sorted(eval_results_by_eval_id.keys() - seen_eval_ids) + for eval_id in remaining_eval_ids: + flattened_results.extend(eval_results_by_eval_id[eval_id]) return flattened_results From 23246b14db740c28756fb6fdcebaa849b43374be Mon Sep 17 00:00:00 2001 From: ftnext Date: Sun, 8 Feb 2026 17:19:41 +0900 Subject: [PATCH 5/5] feat(evaluation): change AgentEvaluator persistence from aggregated to per-case as `adk eval` and Web UI/API --- src/google/adk/evaluation/agent_evaluator.py | 50 ++++++++----------- tests/integration/test_with_test_file.py | 3 +- .../evaluation/test_agent_evaluator.py | 23 +++++++-- 3 files changed, 40 insertions(+), 36 deletions(-) diff --git a/src/google/adk/evaluation/agent_evaluator.py b/src/google/adk/evaluation/agent_evaluator.py index 196e2c54af..de5d49e68a 100644 --- a/src/google/adk/evaluation/agent_evaluator.py +++ b/src/google/adk/evaluation/agent_evaluator.py @@ -681,27 +681,6 @@ def _resolve_app_name( return parts[-2] return parts[-1] - @staticmethod - def _flatten_eval_results_by_eval_case_order( - eval_set: EvalSet, - eval_results_by_eval_id: dict[str, list[EvalCaseResult]], - ) -> list[EvalCaseResult]: - """Returns eval results flattened in eval case order.""" - flattened_results: list[EvalCaseResult] = [] - seen_eval_ids = set() - for eval_case in eval_set.eval_cases: - eval_results = eval_results_by_eval_id.get(eval_case.eval_id, []) - if eval_results: - flattened_results.extend(eval_results) - seen_eval_ids.add(eval_case.eval_id) - - # Sort remaining eval ids for deterministic output across runs. - remaining_eval_ids = sorted(eval_results_by_eval_id.keys() - seen_eval_ids) - for eval_id in remaining_eval_ids: - flattened_results.extend(eval_results_by_eval_id[eval_id]) - - return flattened_results - @staticmethod def _maybe_save_eval_set_result( agent_module: str, @@ -717,16 +696,27 @@ def _maybe_save_eval_set_result( resolved_app_name = AgentEvaluator._resolve_app_name( agent_module=agent_module, app_name=app_name ) - all_eval_case_results = ( - AgentEvaluator._flatten_eval_results_by_eval_case_order( - eval_set=eval_set, eval_results_by_eval_id=eval_results_by_eval_id + seen_eval_ids = set() + for eval_case in eval_set.eval_cases: + eval_results = eval_results_by_eval_id.get(eval_case.eval_id, []) + for eval_case_result in eval_results: + eval_set_results_manager.save_eval_set_result( + app_name=resolved_app_name, + eval_set_id=eval_set.eval_set_id, + eval_case_results=[eval_case_result], + ) + if eval_results: + seen_eval_ids.add(eval_case.eval_id) + + # Save any remaining eval ids in sorted order for deterministic output. 
+ remaining_eval_ids = sorted(eval_results_by_eval_id.keys() - seen_eval_ids) + for eval_id in remaining_eval_ids: + for eval_case_result in eval_results_by_eval_id[eval_id]: + eval_set_results_manager.save_eval_set_result( + app_name=resolved_app_name, + eval_set_id=eval_set.eval_set_id, + eval_case_results=[eval_case_result], ) - ) - eval_set_results_manager.save_eval_set_result( - app_name=resolved_app_name, - eval_set_id=eval_set.eval_set_id, - eval_case_results=all_eval_case_results, - ) @staticmethod def _process_metrics_and_get_failures( diff --git a/tests/integration/test_with_test_file.py b/tests/integration/test_with_test_file.py index 22ca98ffaa..714a01be91 100644 --- a/tests/integration/test_with_test_file.py +++ b/tests/integration/test_with_test_file.py @@ -51,6 +51,7 @@ async def test_with_single_test_file_saves_eval_set_result( eval_dataset_file_path_or_dir=( "tests/integration/fixture/home_automation_agent/simple_test.test.json" ), + num_runs=2, eval_set_results_manager=eval_set_results_manager, ) @@ -59,4 +60,4 @@ async def test_with_single_test_file_saves_eval_set_result( "*.evalset_result.json" ) ) - assert saved_result_files + assert len(saved_result_files) == 2 diff --git a/tests/unittests/evaluation/test_agent_evaluator.py b/tests/unittests/evaluation/test_agent_evaluator.py index 387a1d0791..dabf010eeb 100644 --- a/tests/unittests/evaluation/test_agent_evaluator.py +++ b/tests/unittests/evaluation/test_agent_evaluator.py @@ -77,11 +77,24 @@ async def test_evaluate_eval_set_saves_results_with_explicit_app_name(mocker): print_detailed_results=False, ) - manager.save_eval_set_result.assert_called_once_with( - app_name='custom_app', - eval_set_id='eval_set_1', - eval_case_results=[result_b, result_b_2, result_a], - ) + assert manager.save_eval_set_result.call_count == 3 + assert manager.save_eval_set_result.call_args_list == [ + mocker.call( + app_name='custom_app', + eval_set_id='eval_set_1', + eval_case_results=[result_b], + ), + mocker.call( + app_name='custom_app', + eval_set_id='eval_set_1', + eval_case_results=[result_b_2], + ), + mocker.call( + app_name='custom_app', + eval_set_id='eval_set_1', + eval_case_results=[result_a], + ), + ] @pytest.mark.asyncio