google · AhrendsW · Dec 16, 2025 · Dec 17, 2025
diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
@@ -14,7 +14,9 @@
 
 from __future__ import annotations
 
+import re
 from typing import Optional
+import unicodedata
 
 from google.genai import types as genai_types
 from typing_extensions import override
@@ -92,6 +94,63 @@ def _get_eval_status(score: float, threshold: float):
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
 
 
+class _UnicodeTokenizer:
+  """Case-insensitive word tokenizer for text in any Unicode script.
+
+  rouge_scorer's default tokenizer keeps only ASCII letters and digits, so
+  it yields no tokens for Thai, Chinese, Arabic, etc. Like that tokenizer,
+  this one lowercases and splits on underscores. No word segmentation is
+  done for scripts written without spaces between words (e.g. Chinese).
+  """
+
+  def tokenize(self, text: str) -> list[str]:
+    """Splits lowercased text into runs of Unicode letters and digits.
+
+    Args:
+        text: The text to tokenize.
+
+    Returns:
+        Lowercased tokens in order of appearance; empty if none are found.
+    """
+    return re.findall(r"[^\W_]+", text.lower())
+
+
+def _is_latin_script(text: str) -> bool:
+  """Reports whether most letters in the text belong to the Latin script.
+
+  The answer decides whether English-specific stemming should be applied.
+  Only characters whose Unicode general category starts with "L" (letters)
+  are counted; digits, punctuation, whitespace and symbols are ignored. A
+  letter is treated as Latin when its Unicode character name contains
+  "LATIN", e.g. the letters of English, Portuguese, Spanish, French or
+  German; Thai, Chinese, Arabic, Japanese or Korean letters are not.
+
+  Args:
+      text: The text to analyze.
+
+  Returns:
+      True if strictly more than half of the letters are Latin, or if the
+      text has no letters at all (this includes the empty string); False
+      otherwise.
+  """
+  # One Unicode name per letter in the text; unnamed letters map to "".
+  letter_names = [
+      unicodedata.name(ch, "")
+      for ch in text
+      if unicodedata.category(ch).startswith("L")
+  ]
+
+  # Nothing to classify (empty, or digits/punctuation only): assume Latin.
+  if not letter_names:
+    return True
+
+  # Count the letters whose Unicode name marks them as Latin.
+  latin_total = sum("LATIN" in name for name in letter_names)
+
+  # A strict majority is required, so an even split counts as non-Latin.
+  return latin_total / len(letter_names) > 0.5
+
+
 def _calculate_rouge_1_scores(candidate: str, reference: str):
   """Calculates the ROUGE-1 score between a candidate and reference text.
 
@@ -103,14 +162,32 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
   candidate.
   - F-measure: The harmonic mean of precision and recall.
 
+  Stemming is only applied for Latin script text (English, Portuguese, etc.)
+  since the Porter stemmer only works correctly for English. For non-Latin
+  scripts (Thai, Chinese, Arabic, etc.), stemming is disabled to ensure
+  accurate matching.
+
   Args:
       candidate: The generated text to be evaluated.
       reference: The ground-truth text to compare against.
 
   Returns:
       A dictionary containing the ROUGE-1 precision, recall, and f-measure.
   """
-  scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
+  # Check if both texts are Latin script
+  is_latin = _is_latin_script(candidate) and _is_latin_script(reference)
+
+  # For Latin scripts (English, Portuguese, etc.): use default tokenizer with
+  # stemmer. For non-Latin scripts (Thai, Chinese, Arabic, etc.): use custom
+  # Unicode tokenizer without stemmer, since:
+  # 1. Porter stemmer only works for English
+  # 2. Default tokenizer doesn't handle Unicode characters properly
+  if is_latin:
+    scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
+  else:
+    scorer = rouge_scorer.RougeScorer(
+        ["rouge1"], use_stemmer=False, tokenizer=_UnicodeTokenizer()
+    )
 
   # The score method returns a dictionary where keys are the ROUGE types
   # and values are Score objects (tuples) with precision, recall, and fmeasure.