
Commit 36e45cd

ankursharmas authored and copybara-github committed
feat: Enable FinalResponseMatchV2 metric as an experiment
PiperOrigin-RevId: 784346859
1 parent 35de210 commit 36e45cd

File tree

4 files changed (+39, −12 lines):

src/google/adk/cli/cli_eval.py
src/google/adk/evaluation/eval_metrics.py
src/google/adk/evaluation/final_response_match_v2.py
src/google/adk/evaluation/metric_evaluator_registry.py

src/google/adk/cli/cli_eval.py

Lines changed: 17 additions & 4 deletions

@@ -15,6 +15,7 @@
 from __future__ import annotations
 
 import importlib.util
+import inspect
 import json
 import logging
 import os
@@ -31,6 +32,7 @@
 from ..evaluation.eval_metrics import EvalMetric
 from ..evaluation.eval_metrics import EvalMetricResult
 from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
+from ..evaluation.eval_metrics import JudgeModelOptions
 from ..evaluation.eval_result import EvalCaseResult
 from ..evaluation.evaluator import EvalStatus
 from ..evaluation.evaluator import Evaluator
@@ -42,6 +44,7 @@
 TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
 RESPONSE_MATCH_SCORE_KEY = "response_match_score"
 SAFETY_V1_KEY = "safety_v1"
+FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"
 # This evaluation is not very stable.
 # This is always optional unless explicitly specified.
 RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score"
@@ -191,10 +194,16 @@ async def run_evals(
     for eval_metric in eval_metrics:
       metric_evaluator = _get_evaluator(eval_metric)
 
-      evaluation_result = metric_evaluator.evaluate_invocations(
-          actual_invocations=inference_result,
-          expected_invocations=eval_case.conversation,
-      )
+      if inspect.iscoroutinefunction(metric_evaluator.evaluate_invocations):
+        evaluation_result = await metric_evaluator.evaluate_invocations(
+            actual_invocations=inference_result,
+            expected_invocations=eval_case.conversation,
+        )
+      else:
+        evaluation_result = metric_evaluator.evaluate_invocations(
+            actual_invocations=inference_result,
+            expected_invocations=eval_case.conversation,
+        )
 
       overall_eval_metric_results.append(
           EvalMetricResult(
@@ -260,6 +269,7 @@ async def run_evals(
 
 def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
   try:
+    from ..evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator
     from ..evaluation.response_evaluator import ResponseEvaluator
     from ..evaluation.safety_evaluator import SafetyEvaluatorV1
     from ..evaluation.trajectory_evaluator import TrajectoryEvaluator
@@ -276,5 +286,8 @@ def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
     )
   elif eval_metric.metric_name == SAFETY_V1_KEY:
     return SafetyEvaluatorV1(eval_metric)
+  elif eval_metric.metric_name == FINAL_RESPONSE_MATCH_V2:
+    eval_metric.judge_model_options = JudgeModelOptions()
+    return FinalResponseMatchV2Evaluator(eval_metric)
 
   raise ValueError(f"Unsupported eval metric: {eval_metric}")
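
The notable change above is the dispatch in run_evals: inspect.iscoroutinefunction lets a single call site support evaluators whose evaluate_invocations is either synchronous or async. A minimal, self-contained sketch of the same pattern (SyncEvaluator, AsyncEvaluator, and run_metric are hypothetical stand-ins, not ADK types):

import asyncio
import inspect


class SyncEvaluator:
  """Hypothetical evaluator with a synchronous evaluate_invocations."""

  def evaluate_invocations(self, actual_invocations, expected_invocations):
    return f"sync: {len(actual_invocations)} invocation(s)"


class AsyncEvaluator:
  """Hypothetical evaluator with an async evaluate_invocations."""

  async def evaluate_invocations(self, actual_invocations, expected_invocations):
    await asyncio.sleep(0)  # stands in for, e.g., an LLM-as-judge call
    return f"async: {len(actual_invocations)} invocation(s)"


async def run_metric(evaluator, actual, expected):
  # Same branching as the commit: await only when the bound method is a
  # coroutine function, otherwise call it synchronously.
  if inspect.iscoroutinefunction(evaluator.evaluate_invocations):
    return await evaluator.evaluate_invocations(
        actual_invocations=actual, expected_invocations=expected
    )
  return evaluator.evaluate_invocations(
      actual_invocations=actual, expected_invocations=expected
  )


print(asyncio.run(run_metric(SyncEvaluator(), [1], [1])))   # sync path
print(asyncio.run(run_metric(AsyncEvaluator(), [1], [1])))  # async path

Checking the bound method rather than the evaluator's type keeps the call site agnostic to which Evaluator subclass _get_evaluator returned.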

src/google/adk/evaluation/eval_metrics.py

Lines changed: 4 additions & 0 deletions

@@ -36,6 +36,10 @@ class PrebuiltMetrics(Enum):
 
   RESPONSE_MATCH_SCORE = "response_match_score"
 
+  SAFETY_V1 = "safety_v1"
+
+  FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"
+
 
 MetricName: TypeAlias = Union[str, PrebuiltMetrics]
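
For context, this file pairs the enum with MetricName: TypeAlias = Union[str, PrebuiltMetrics], so callers can pass a metric name either as a raw string or as an enum member. A standalone sketch of that pattern, with a hypothetical normalize_metric_name helper that mirrors how the registry below passes PrebuiltMetrics.*.value:

from enum import Enum
from typing import Union

from typing_extensions import TypeAlias


class PrebuiltMetrics(Enum):
  RESPONSE_MATCH_SCORE = "response_match_score"
  SAFETY_V1 = "safety_v1"
  FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"


MetricName: TypeAlias = Union[str, PrebuiltMetrics]


def normalize_metric_name(name: MetricName) -> str:
  # Hypothetical helper: accept either form and return the plain string.
  return name.value if isinstance(name, PrebuiltMetrics) else name


assert normalize_metric_name(PrebuiltMetrics.SAFETY_V1) == normalize_metric_name("safety_v1")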

src/google/adk/evaluation/final_response_match_v2.py

Lines changed: 2 additions & 2 deletions

@@ -21,7 +21,7 @@
 from typing_extensions import override
 
 from ..models.llm_response import LlmResponse
-from ..utils.feature_decorator import working_in_progress
+from ..utils.feature_decorator import experimental
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
 from .evaluator import EvalStatus
@@ -125,7 +125,7 @@ def _parse_critique(response: str) -> Label:
  return label
 
 
-@working_in_progress
+@experimental
 class FinalResponseMatchV2Evaluator(LlmAsJudge):
  """V2 final response match evaluator which uses an LLM to judge responses.
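
The body of the experimental decorator is not part of this diff; purely as a rough illustration of the general technique, here is a hypothetical class decorator that warns whenever an experimental class is instantiated (the real feature_decorator module may behave differently):

import functools
import warnings


def experimental(cls):
  """Hypothetical sketch: warn when an experimental class is instantiated."""
  original_init = cls.__init__

  @functools.wraps(original_init)
  def wrapped_init(self, *args, **kwargs):
    warnings.warn(
        f"{cls.__name__} is experimental and may change without notice.",
        UserWarning,
        stacklevel=2,
    )
    original_init(self, *args, **kwargs)

  cls.__init__ = wrapped_init
  return cls


@experimental
class FinalResponseMatchV2Evaluator:  # stand-in, not the ADK class
  def __init__(self, eval_metric):
    self.eval_metric = eval_metric


_ = FinalResponseMatchV2Evaluator(eval_metric=None)  # emits the warning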

src/google/adk/evaluation/metric_evaluator_registry.py

Lines changed: 16 additions & 6 deletions

@@ -21,7 +21,9 @@
 from .eval_metrics import MetricName
 from .eval_metrics import PrebuiltMetrics
 from .evaluator import Evaluator
+from .final_response_match_v2 import FinalResponseMatchV2Evaluator
 from .response_evaluator import ResponseEvaluator
+from .safety_evaluator import SafetyEvaluatorV1
 from .trajectory_evaluator import TrajectoryEvaluator
 
 logger = logging.getLogger("google_adk." + __name__)
@@ -71,16 +73,24 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
   metric_evaluator_registry = MetricEvaluatorRegistry()
 
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE,
-      evaluator=type(TrajectoryEvaluator),
+      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value,
+      evaluator=TrajectoryEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE,
-      evaluator=type(ResponseEvaluator),
+      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value,
+      evaluator=ResponseEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE,
-      evaluator=type(ResponseEvaluator),
+      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
+      evaluator=ResponseEvaluator,
+  )
+  metric_evaluator_registry.register_evaluator(
+      metric_name=PrebuiltMetrics.SAFETY_V1.value,
+      evaluator=SafetyEvaluatorV1,
+  )
+  metric_evaluator_registry.register_evaluator(
+      metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
+      evaluator=FinalResponseMatchV2Evaluator,
   )
 
   return metric_evaluator_registry
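
This file also fixes a subtle bug visible in the hunk above: the old code registered type(TrajectoryEvaluator), which evaluates to the metaclass type rather than the evaluator class, so instantiating the registered value could never produce an evaluator. A simplified sketch of the registry pattern and the fix (all names below are stand-ins for the ADK types):

class Evaluator:
  """Stand-in base class."""


class TrajectoryEvaluator(Evaluator):
  def __init__(self, eval_metric):
    self.eval_metric = eval_metric


class MetricEvaluatorRegistry:
  """Hypothetical simplified registry mapping metric names to classes."""

  def __init__(self):
    self._registry: dict[str, type[Evaluator]] = {}

  def register_evaluator(self, metric_name: str, evaluator: type[Evaluator]):
    self._registry[metric_name] = evaluator

  def get_evaluator(self, metric_name: str, eval_metric) -> Evaluator:
    # Instantiate the registered class; this is why the class itself,
    # not type(cls), must be stored.
    return self._registry[metric_name](eval_metric)


registry = MetricEvaluatorRegistry()

# Before the fix: type(TrajectoryEvaluator) is the metaclass `type`,
# not the evaluator class.
assert type(TrajectoryEvaluator) is type

# After the fix: register the class object directly.
registry.register_evaluator("tool_trajectory_avg_score", TrajectoryEvaluator)
evaluator = registry.get_evaluator("tool_trajectory_avg_score", eval_metric=None)
assert isinstance(evaluator, TrajectoryEvaluator)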
