
Commit 36e45cd

ankursharmas authored and copybara-github committed
feat: Enable FinalResponseMatchV2 metric as an experiment
PiperOrigin-RevId: 784346859
1 parent 35de210 commit 36e45cd

File tree

4 files changed (+39, −12 lines):

src/google/adk/cli/cli_eval.py
src/google/adk/evaluation/eval_metrics.py
src/google/adk/evaluation/final_response_match_v2.py
src/google/adk/evaluation/metric_evaluator_registry.py

src/google/adk/cli/cli_eval.py

Lines changed: 17 additions & 4 deletions

@@ -15,6 +15,7 @@
 from __future__ import annotations
 
 import importlib.util
+import inspect
 import json
 import logging
 import os
@@ -31,6 +32,7 @@
 from ..evaluation.eval_metrics import EvalMetric
 from ..evaluation.eval_metrics import EvalMetricResult
 from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
+from ..evaluation.eval_metrics import JudgeModelOptions
 from ..evaluation.eval_result import EvalCaseResult
 from ..evaluation.evaluator import EvalStatus
 from ..evaluation.evaluator import Evaluator
@@ -42,6 +44,7 @@
 TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
 RESPONSE_MATCH_SCORE_KEY = "response_match_score"
 SAFETY_V1_KEY = "safety_v1"
+FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"
 # This evaluation is not very stable.
 # This is always optional unless explicitly specified.
 RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score"
@@ -191,10 +194,16 @@ async def run_evals(
     for eval_metric in eval_metrics:
       metric_evaluator = _get_evaluator(eval_metric)
 
-      evaluation_result = metric_evaluator.evaluate_invocations(
-          actual_invocations=inference_result,
-          expected_invocations=eval_case.conversation,
-      )
+      if inspect.iscoroutinefunction(metric_evaluator.evaluate_invocations):
+        evaluation_result = await metric_evaluator.evaluate_invocations(
+            actual_invocations=inference_result,
+            expected_invocations=eval_case.conversation,
+        )
+      else:
+        evaluation_result = metric_evaluator.evaluate_invocations(
+            actual_invocations=inference_result,
+            expected_invocations=eval_case.conversation,
+        )
 
       overall_eval_metric_results.append(
           EvalMetricResult(
@@ -260,6 +269,7 @@ async def run_evals(
 
 def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
   try:
+    from ..evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator
     from ..evaluation.response_evaluator import ResponseEvaluator
     from ..evaluation.safety_evaluator import SafetyEvaluatorV1
     from ..evaluation.trajectory_evaluator import TrajectoryEvaluator
@@ -276,5 +286,8 @@ def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
     )
   elif eval_metric.metric_name == SAFETY_V1_KEY:
     return SafetyEvaluatorV1(eval_metric)
+  elif eval_metric.metric_name == FINAL_RESPONSE_MATCH_V2:
+    eval_metric.judge_model_options = JudgeModelOptions()
+    return FinalResponseMatchV2Evaluator(eval_metric)
 
   raise ValueError(f"Unsupported eval metric: {eval_metric}")
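
The notable change above is the dispatch in run_evals: inspect.iscoroutinefunction lets a single call site support evaluators whose evaluate_invocations is either synchronous or async. A minimal, self-contained sketch of the same pattern (SyncEvaluator, AsyncEvaluator, and run_metric are hypothetical stand-ins, not ADK types):

import asyncio
import inspect


class SyncEvaluator:
  """Hypothetical evaluator with a synchronous evaluate_invocations."""

  def evaluate_invocations(self, actual_invocations, expected_invocations):
    return f"sync: {len(actual_invocations)} invocation(s)"


class AsyncEvaluator:
  """Hypothetical evaluator with an async evaluate_invocations."""

  async def evaluate_invocations(self, actual_invocations, expected_invocations):
    await asyncio.sleep(0)  # stands in for, e.g., an LLM-as-judge call
    return f"async: {len(actual_invocations)} invocation(s)"


async def run_metric(evaluator, actual, expected):
  # Same branching as the commit: await only when the bound method is a
  # coroutine function, otherwise call it synchronously.
  if inspect.iscoroutinefunction(evaluator.evaluate_invocations):
    return await evaluator.evaluate_invocations(
        actual_invocations=actual, expected_invocations=expected
    )
  return evaluator.evaluate_invocations(
      actual_invocations=actual, expected_invocations=expected
  )


print(asyncio.run(run_metric(SyncEvaluator(), [1], [1])))   # sync path
print(asyncio.run(run_metric(AsyncEvaluator(), [1], [1])))  # async path

Checking the bound method rather than the evaluator's type keeps the call site agnostic to which Evaluator subclass _get_evaluator returned.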

src/google/adk/evaluation/eval_metrics.py

Lines changed: 4 additions & 0 deletions

@@ -36,6 +36,10 @@ class PrebuiltMetrics(Enum):
 
   RESPONSE_MATCH_SCORE = "response_match_score"
 
+  SAFETY_V1 = "safety_v1"
+
+  FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"
+
 
 MetricName: TypeAlias = Union[str, PrebuiltMetrics]
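
For context, this file pairs the enum with MetricName: TypeAlias = Union[str, PrebuiltMetrics], so callers can pass a metric name either as a raw string or as an enum member. A standalone sketch of that pattern, with a hypothetical normalize_metric_name helper that mirrors how the registry below passes PrebuiltMetrics.*.value:

from enum import Enum
from typing import Union

from typing_extensions import TypeAlias


class PrebuiltMetrics(Enum):
  RESPONSE_MATCH_SCORE = "response_match_score"
  SAFETY_V1 = "safety_v1"
  FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"


MetricName: TypeAlias = Union[str, PrebuiltMetrics]


def normalize_metric_name(name: MetricName) -> str:
  # Hypothetical helper: accept either form and return the plain string.
  return name.value if isinstance(name, PrebuiltMetrics) else name


assert normalize_metric_name(PrebuiltMetrics.SAFETY_V1) == normalize_metric_name("safety_v1")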

src/google/adk/evaluation/final_response_match_v2.py

Lines changed: 2 additions & 2 deletions

@@ -21,7 +21,7 @@
 from typing_extensions import override
 
 from ..models.llm_response import LlmResponse
-from ..utils.feature_decorator import working_in_progress
+from ..utils.feature_decorator import experimental
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
 from .evaluator import EvalStatus
@@ -125,7 +125,7 @@ def _parse_critique(response: str) -> Label:
  return label
 
 
-@working_in_progress
+@experimental
 class FinalResponseMatchV2Evaluator(LlmAsJudge):
  """V2 final response match evaluator which uses an LLM to judge responses.
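
The body of the experimental decorator is not part of this diff; purely as a rough illustration of the general technique, here is a hypothetical class decorator that warns whenever an experimental class is instantiated (the real feature_decorator module may behave differently):

import functools
import warnings


def experimental(cls):
  """Hypothetical sketch: warn when an experimental class is instantiated."""
  original_init = cls.__init__

  @functools.wraps(original_init)
  def wrapped_init(self, *args, **kwargs):
    warnings.warn(
        f"{cls.__name__} is experimental and may change without notice.",
        UserWarning,
        stacklevel=2,
    )
    original_init(self, *args, **kwargs)

  cls.__init__ = wrapped_init
  return cls


@experimental
class FinalResponseMatchV2Evaluator:  # stand-in, not the ADK class
  def __init__(self, eval_metric):
    self.eval_metric = eval_metric


_ = FinalResponseMatchV2Evaluator(eval_metric=None)  # emits the warning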

src/google/adk/evaluation/metric_evaluator_registry.py

Lines changed: 16 additions & 6 deletions

@@ -21,7 +21,9 @@
 from .eval_metrics import MetricName
 from .eval_metrics import PrebuiltMetrics
 from .evaluator import Evaluator
+from .final_response_match_v2 import FinalResponseMatchV2Evaluator
 from .response_evaluator import ResponseEvaluator
+from .safety_evaluator import SafetyEvaluatorV1
 from .trajectory_evaluator import TrajectoryEvaluator
 
 logger = logging.getLogger("google_adk." + __name__)
@@ -71,16 +73,24 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
   metric_evaluator_registry = MetricEvaluatorRegistry()
 
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE,
-      evaluator=type(TrajectoryEvaluator),
+      metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value,
+      evaluator=TrajectoryEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE,
-      evaluator=type(ResponseEvaluator),
+      metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value,
+      evaluator=ResponseEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE,
-      evaluator=type(ResponseEvaluator),
+      metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
+      evaluator=ResponseEvaluator,
+  )
+  metric_evaluator_registry.register_evaluator(
+      metric_name=PrebuiltMetrics.SAFETY_V1.value,
+      evaluator=SafetyEvaluatorV1,
+  )
+  metric_evaluator_registry.register_evaluator(
+      metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
+      evaluator=FinalResponseMatchV2Evaluator,
   )
 
   return metric_evaluator_registry
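
This file also fixes a subtle bug visible in the hunk above: the old code registered type(TrajectoryEvaluator), which evaluates to the metaclass type rather than the evaluator class, so instantiating the registered value could never produce an evaluator. A simplified sketch of the registry pattern and the fix (all names below are stand-ins for the ADK types):

class Evaluator:
  """Stand-in base class."""


class TrajectoryEvaluator(Evaluator):
  def __init__(self, eval_metric):
    self.eval_metric = eval_metric


class MetricEvaluatorRegistry:
  """Hypothetical simplified registry mapping metric names to classes."""

  def __init__(self):
    self._registry: dict[str, type[Evaluator]] = {}

  def register_evaluator(self, metric_name: str, evaluator: type[Evaluator]):
    self._registry[metric_name] = evaluator

  def get_evaluator(self, metric_name: str, eval_metric) -> Evaluator:
    # Instantiate the registered class; this is why the class itself,
    # not type(cls), must be stored.
    return self._registry[metric_name](eval_metric)


registry = MetricEvaluatorRegistry()

# Before the fix: type(TrajectoryEvaluator) is the metaclass `type`,
# not the evaluator class.
assert type(TrajectoryEvaluator) is type

# After the fix: register the class object directly.
registry.register_evaluator("tool_trajectory_avg_score", TrajectoryEvaluator)
evaluator = registry.get_evaluator("tool_trajectory_avg_score", eval_metric=None)
assert isinstance(evaluator, TrajectoryEvaluator)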
