Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 78 additions & 1 deletion src/google/adk/evaluation/final_response_match_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@

from __future__ import annotations

import re
from typing import Optional
import unicodedata

from google.genai import types as genai_types
from typing_extensions import override
Expand Down Expand Up @@ -92,6 +94,63 @@ def _get_eval_status(score: float, threshold: float):
return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED


class _UnicodeTokenizer:
"""A tokenizer that handles Unicode text for non-Latin scripts.

The default rouge_scorer tokenizer only works with ASCII characters,
returning empty token lists for non-Latin scripts like Thai, Chinese,
Arabic, etc. This tokenizer uses Unicode-aware regex to properly
tokenize text in any script.
"""

def tokenize(self, text: str) -> list[str]:
"""Tokenizes text using Unicode-aware word boundaries.

Args:
text: The text to tokenize.

Returns:
A list of tokens (words) from the text.
"""
return re.findall(r"\w+", text, re.UNICODE)


def _is_latin_script(text: str) -> bool:
"""Checks if text is primarily Latin script.

This is used to determine whether to apply English-specific stemming.
Latin script includes English, Portuguese, Spanish, French, German, etc.
Non-Latin scripts include Thai, Chinese, Arabic, Japanese, Korean, etc.

Args:
text: The text to analyze.

Returns:
True if the text is primarily Latin script, False otherwise.
"""
if not text:
return True

latin_chars = 0
letter_chars = 0

for char in text:
# Check if character is a letter (category starts with 'L')
if unicodedata.category(char).startswith("L"):
letter_chars += 1
# Check if it's a Latin character by looking at its Unicode name
char_name = unicodedata.name(char, "")
if "LATIN" in char_name:
latin_chars += 1

# If no letters found, default to Latin (likely punctuation/numbers only)
if letter_chars == 0:
return True

# Consider text as Latin if more than 50% of letters are Latin
return latin_chars / letter_chars > 0.5


def _calculate_rouge_1_scores(candidate: str, reference: str):
"""Calculates the ROUGE-1 score between a candidate and reference text.

Expand All @@ -103,14 +162,32 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
candidate.
- F-measure: The harmonic mean of precision and recall.

Stemming is only applied for Latin script text (English, Portuguese, etc.)
since the Porter stemmer only works correctly for English. For non-Latin
scripts (Thai, Chinese, Arabic, etc.), stemming is disabled to ensure
accurate matching.

Args:
candidate: The generated text to be evaluated.
reference: The ground-truth text to compare against.

Returns:
A dictionary containing the ROUGE-1 precision, recall, and f-measure.
"""
scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
# Check if both texts are Latin script
is_latin = _is_latin_script(candidate) and _is_latin_script(reference)

# For Latin scripts (English, Portuguese, etc.): use default tokenizer with
# stemmer. For non-Latin scripts (Thai, Chinese, Arabic, etc.): use custom
# Unicode tokenizer without stemmer, since:
# 1. Porter stemmer only works for English
# 2. Default tokenizer doesn't handle Unicode characters properly
if is_latin:
scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
else:
scorer = rouge_scorer.RougeScorer(
["rouge1"], use_stemmer=False, tokenizer=_UnicodeTokenizer()
)

# The score method returns a dictionary where keys are the ROUGE types
# and values are Score objects (tuples) with precision, recall, and fmeasure.
Expand Down
Loading