8 changes: 8 additions & 0 deletions aieng-eval-agents/aieng/agent_evals/aml_investigation/graders/__init__.py
@@ -0,0 +1,8 @@
"""Graders for AML investigation evaluation."""

from .item import item_level_deterministic_grader
from .run import run_level_grader
from .trace import trace_deterministic_grader


__all__ = ["item_level_deterministic_grader", "run_level_grader", "trace_deterministic_grader"]
61 changes: 61 additions & 0 deletions aieng-eval-agents/aieng/agent_evals/aml_investigation/graders/_common.py
@@ -0,0 +1,61 @@
"""Shared helpers for AML graders."""

from collections.abc import Mapping
from enum import Enum
from typing import Any

from aieng.agent_evals.aml_investigation.data import LaunderingPattern
from aieng.agent_evals.evaluation import ExperimentItemResult


PATTERN_LABELS: tuple[str, ...] = tuple(pattern.value for pattern in LaunderingPattern)


def get_field(payload: Any, key: str) -> Any:
"""Read ``key`` from dict-like or object payloads."""
if isinstance(payload, Mapping):
return payload.get(key)
return getattr(payload, key, None)


def extract_expected_output(item_result: ExperimentItemResult) -> Any:
"""Extract expected_output from local-dict or dataset-item structures."""
item = item_result.item
if isinstance(item, Mapping):
return item.get("expected_output")
return getattr(item, "expected_output", None)


def normalize_pattern(value: Any) -> str | None:
"""Normalize pattern label to uppercase string form."""
if isinstance(value, Enum):
value = value.value
if value is None:
return None
token = str(value).strip()
return token.upper() if token else None


def normalize_transaction_ids(value: Any) -> set[str]:
"""Normalize transaction IDs into a comparable token set."""
if value is None:
return set()

if isinstance(value, str):
return {token.strip() for token in value.split(",") if token.strip()}

if isinstance(value, list | tuple | set):
normalized: set[str] = set()
for item in value:
if item is None:
continue
token = str(item).strip()
if token:
normalized.add(token)
return normalized

token = str(value).strip()
return {token} if token else set()


__all__ = ["PATTERN_LABELS", "extract_expected_output", "get_field", "normalize_pattern", "normalize_transaction_ids"]
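
For quick reference, a minimal usage sketch of these helpers (illustrative inputs only; the expected outputs are inferred from the implementations above, and the snippet assumes the aieng-eval-agents package is importable):

# Illustrative only -- inputs and outputs are not from the repository's tests,
# and "fan-out" is a placeholder string, not necessarily a LaunderingPattern member.
from aieng.agent_evals.aml_investigation.graders._common import (
    get_field,
    normalize_pattern,
    normalize_transaction_ids,
)

print(normalize_pattern("  fan-out "))  # "FAN-OUT" (stripped and uppercased)
print(normalize_pattern(None))  # None
print(normalize_transaction_ids("T1, T2, ,T1"))  # {"T1", "T2"} (deduplicated)
print(normalize_transaction_ids(["T3", None, " T4 "]))  # {"T3", "T4"}
print(get_field({"pattern_type": "NONE"}, "pattern_type"))  # "NONE"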
182 changes: 182 additions & 0 deletions aieng-eval-agents/aieng/agent_evals/aml_investigation/graders/item.py
@@ -0,0 +1,182 @@
"""Item-level deterministic graders for AML investigation agent outputs.

This module contains evaluator functions that score one AML case prediction against
the case ground truth and return a list of per-item metrics suitable for run-level
aggregation.

Examples
--------
>>> from aieng.agent_evals.aml_investigation.graders import (
... item_level_deterministic_grader,
... )
>>> from aieng.agent_evals.aml_investigation.task import AmlInvestigationTask
>>> from aieng.agent_evals.evaluation import run_experiment
>>> task = AmlInvestigationTask()
>>> results = run_experiment(
... # <YOUR_DATASET_NAME>,
... name="aml_item_level_demo",
... task=task,
... evaluators=[item_level_deterministic_grader],
... )
"""

from typing import Any

from aieng.agent_evals.evaluation import Evaluation

from ._common import get_field, normalize_pattern, normalize_transaction_ids


def item_level_deterministic_grader(
input: Any, # noqa: A002
output: Any,
expected_output: Any,
metadata: dict[str, Any] | None = None,
**kwargs: Any,
) -> list[Evaluation]:
"""Evaluate one AML prediction using deterministic rules.

Parameters
----------
input : Any
Item input payload. Included for evaluator interface compatibility and
not used directly.
output : Any
Model output payload. Expected to contain fields such as
``is_laundering``, ``pattern_type``, and ``flagged_transaction_ids``.
expected_output : Any
Ground-truth payload. Expected to contain fields such as
``is_laundering``, ``pattern_type``, and ``attempt_transaction_ids``.
metadata : dict[str, Any] | None, optional
Optional item metadata from the dataset. Not used by this grader.
**kwargs : Any
Additional evaluator kwargs. Ignored by this grader.

Returns
-------
list[Evaluation]
Deterministic per-item metrics, including:
``is_laundering_correct``, ``is_laundering_tp/fp/fn/tn``,
``pattern_type_correct``, ``non_laundering_pattern_consistent``,
``non_laundering_flags_empty``, ``id_precision_like``, and
``id_coverage``.

Examples
--------
>>> output = {
... "is_laundering": False,
... "pattern_type": "NONE",
... "flagged_transaction_ids": "",
... }
>>> expected_output = {
... "is_laundering": False,
... "pattern_type": "NONE",
... "attempt_transaction_ids": "",
... }
>>> evaluations = item_level_deterministic_grader(
... input={},
... output=output,
... expected_output=expected_output,
... )
>>> [e.value for e in evaluations if e.name == "non_laundering_flags_empty"][0]
1.0
"""
del input, metadata, kwargs # Unused but part of evaluator interface.

# Evaluate laundering prediction correctness
expected_is_laundering: bool | None = get_field(expected_output, "is_laundering")
predicted_is_laundering = get_field(output, "is_laundering")
is_laundering_correct = predicted_is_laundering == expected_is_laundering

# Confusion matrix components for is_laundering
is_tp = bool(expected_is_laundering is True and predicted_is_laundering is True)
is_fp = bool(expected_is_laundering is False and predicted_is_laundering is True)
is_fn = bool(expected_is_laundering is True and predicted_is_laundering is False)

# Evaluate pattern type correctness (exact match)
expected_pattern = normalize_pattern(get_field(expected_output, "pattern_type"))
predicted_pattern = normalize_pattern(get_field(output, "pattern_type"))
pattern_type_correct = predicted_pattern == expected_pattern

# Evaluate flagged transaction ID predictions
ground_truth_ids = normalize_transaction_ids(get_field(expected_output, "attempt_transaction_ids"))
predicted_ids = normalize_transaction_ids(get_field(output, "flagged_transaction_ids"))

true_positive_ids = ground_truth_ids & predicted_ids
false_positive_ids = predicted_ids - ground_truth_ids
false_negative_ids = ground_truth_ids - predicted_ids

tp_count = len(true_positive_ids)
fp_count = len(false_positive_ids)
fn_count = len(false_negative_ids)
predicted_count = len(predicted_ids)
ground_truth_count = len(ground_truth_ids)

# Precision-like for flagged IDs: of the predicted IDs, how many were correct?
id_precision_like = float(tp_count - fp_count) / float(predicted_count) if predicted_count else 0.0
Copilot AI Feb 13, 2026

The id_precision_like metric calculation (tp_count - fp_count) / predicted_count can produce negative values when false positives exceed true positives. This is unconventional for a precision-like metric and may confuse users interpreting results. Standard precision is tp_count / predicted_count, i.e. tp_count / (tp_count + fp_count). Consider using the standard precision formula or renaming this metric to clarify that it is a "net accuracy" score that can be negative.

Suggested change
id_precision_like = float(tp_count - fp_count) / float(predicted_count) if predicted_count else 0.0
id_precision_like = float(tp_count) / float(predicted_count) if predicted_count else 0.0

Collaborator Author

That is intentional. The range of id_precision_like is [-1, 1]: -1 means every transaction ID the agent flagged is wrong, 0 means the agent flagged no transactions (or flagged equally many correct and incorrect IDs), and 1 means every transaction ID the agent flagged is correct.
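
A quick arithmetic check of those endpoints (illustrative values, not taken from the repository's tests):

# Endpoints of id_precision_like = (tp - fp) / predicted, for one ground-truth ID.
ground_truth_ids = {"T1"}
for predicted_ids in ({"T2", "T3"}, set(), {"T1"}):
    tp = len(ground_truth_ids & predicted_ids)
    fp = len(predicted_ids - ground_truth_ids)
    score = (tp - fp) / len(predicted_ids) if predicted_ids else 0.0
    print(sorted(predicted_ids), score)
# ['T2', 'T3'] -1.0  -> every flagged ID is wrong
# []            0.0  -> nothing flagged
# ['T1']        1.0  -> every flagged ID is correct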


# Coverage: of the ground truth IDs, how many were correctly predicted?
id_coverage = float(tp_count) / float(ground_truth_count) if ground_truth_count else 0.0

# Consistency checks for predicted benign cases
# If the agent predicts a case is not laundering, the predicted pattern should
# be "NONE" and no transaction IDs should be flagged.
predicted_benign = predicted_is_laundering is False
predicted_benign_pattern_consistent = (predicted_pattern == "NONE") if predicted_benign else True
predicted_benign_ids_consistent = (predicted_count == 0) if predicted_benign else True

return [
Evaluation(
name="is_laundering_correct",
value=1.0 if is_laundering_correct else 0.0,
metadata={
"expected": expected_is_laundering,
"actual": predicted_is_laundering,
"type": "TP" if is_tp else "FP" if is_fp else "FN" if is_fn else "TN",
},
),
Evaluation(
name="pattern_type_correct",
value=1.0 if pattern_type_correct else 0.0,
metadata={"expected": expected_pattern, "actual": predicted_pattern},
),
Evaluation(
name="non_laundering_pattern_consistent",
value=1.0 if predicted_benign_pattern_consistent else 0.0,
metadata={
"applicable": predicted_benign,
"is_laundering": predicted_is_laundering,
"pattern_type": predicted_pattern,
},
),
Evaluation(
name="non_laundering_flags_empty",
value=1.0 if predicted_benign_ids_consistent else 0.0,
metadata={
"applicable": predicted_benign,
"is_laundering": predicted_is_laundering,
"predicted_flagged_count": predicted_count,
},
),
Evaluation(
name="id_precision_like",
value=id_precision_like,
metadata={
"true_positive_count": tp_count,
"false_positive_count": fp_count,
"predicted_count": predicted_count,
},
),
Evaluation(
name="id_coverage",
value=id_coverage,
metadata={
"true_positive_count": tp_count,
"false_negative_count": fn_count,
"ground_truth_count": ground_truth_count,
},
),
]


__all__ = ["item_level_deterministic_grader"]
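
For completeness, a direct-call sketch of the laundering-positive path, complementing the benign example in the module docstring (illustrative payloads; "SCATTER-GATHER" is a placeholder label, not necessarily a real LaunderingPattern member):

from aieng.agent_evals.aml_investigation.graders import item_level_deterministic_grader

output = {
    "is_laundering": True,
    "pattern_type": "SCATTER-GATHER",
    "flagged_transaction_ids": "T1, T2, T9",
}
expected_output = {
    "is_laundering": True,
    "pattern_type": "SCATTER-GATHER",
    "attempt_transaction_ids": "T1, T2, T3, T4",
}
for evaluation in item_level_deterministic_grader(
    input={}, output=output, expected_output=expected_output
):
    print(evaluation.name, evaluation.value)
# is_laundering_correct 1.0 (true positive)
# pattern_type_correct 1.0
# non_laundering_pattern_consistent 1.0 (not applicable: predicted laundering)
# non_laundering_flags_empty 1.0 (not applicable)
# id_precision_like 0.3333333333333333 ((2 - 1) / 3 flagged IDs)
# id_coverage 0.5 (2 of 4 ground-truth IDs flagged)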