
Commit f883ca9

ENH & MTN & FIX

- Fix bentkus_p_value calculation
- Fix the higher_is_better logic and move it to a single place
- Implement a unit test for BinaryClassificationRiskControl
- Fix the parametrization of an existing test
1 parent 0c8c12d commit f883ca9

4 files changed: +54, −7 lines


mapie/control_risk/p_values.py

Lines changed: 1 addition & 1 deletion
@@ -89,7 +89,7 @@ def compute_hoeffdding_bentkus_p_value(
     )
     factor = 1 if binary else np.e
     bentkus_p_value = factor * binom.cdf(
-        np.ceil(n_obs_repeat * r_hat_repeat), n_obs, alpha_repeat
+        np.ceil(n_obs_repeat * r_hat_repeat), n_obs_repeat, alpha_repeat
     )
     hb_p_value = np.where(
         bentkus_p_value > hoeffding_p_value,
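
For context, the corrected call passes the broadcast number of observations as the second argument of scipy.stats.binom.cdf, i.e. the trial-count parameter of the binomial distribution. Below is a minimal, hedged sketch of the Bentkus term for a single scalar case; bentkus_term is a hypothetical helper written for illustration, while the library version above operates on the broadcast *_repeat arrays.

import numpy as np
from scipy.stats import binom

def bentkus_term(r_hat: float, n_obs: int, alpha: float, binary: bool = False) -> float:
    # binom.cdf(k, n, p): probability of at most k successes out of n trials
    # with success probability p. The second argument must be the number of
    # observations, which is what the fix above restores.
    factor = 1 if binary else np.e
    return factor * binom.cdf(np.ceil(n_obs * r_hat), n_obs, alpha)

# Illustrative values only: empirical risk 0.12 on 500 calibration points, target level 0.1.
print(bentkus_term(r_hat=0.12, n_obs=500, alpha=0.1))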

mapie/risk_control.py

Lines changed: 0 additions & 2 deletions
@@ -743,8 +743,6 @@ def get_value_and_effective_sample_size(
                 in zip(risk_occurrences, risk_conditions)
                 if risk_condition)
             risk_value = risk_sum / effective_sample_size
-            if self.higher_is_better:
-                risk_value = 1 - risk_value
             return risk_value, effective_sample_size
         return None
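
After this removal, the getter returns the raw metric value together with its effective sample size, leaving any "higher is better" flipping to the caller (see risk_control_draft.py below). A standalone sketch of that contract for a precision-style risk, using a hypothetical helper name rather than the library implementation:

import numpy as np

def precision_value_and_effective_n(y_true, y_pred):
    # Effective sample size for precision: number of positive predictions.
    n_eff = int(np.sum(y_pred == 1))
    if n_eff == 0:
        return None  # mirrors the `return None` branch above
    value = float(np.sum((y_true == 1) & (y_pred == 1)) / n_eff)
    return value, n_eff  # raw value, no `1 - value` flip here anymore

print(precision_value_and_effective_n(np.array([1, 0, 1, 0]), np.array([1, 1, 0, 0])))
# (0.5, 2)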

mapie/risk_control_draft.py

Lines changed: 8 additions & 2 deletions
@@ -30,7 +30,7 @@ def __init__(
         self._predict_function = predict_function
         self._risk = risk
         self._best_predict_param_choice = best_predict_param_choice
-        self._alpha = 1 - target_level
+        self._target_level = target_level
         self._delta = 1 - confidence_level
 
         self._thresholds: NDArray[float] = np.linspace(0, 0.99, 100)
@@ -56,9 +56,15 @@ def calibrate(self, X_calibrate: ArrayLike, y_calibrate: ArrayLike) -> None:
             ) for predictions in predictions_per_threshold]
         )
 
+        if self._risk.higher_is_better:
+            risks_and_eff_sizes[:, 0] = 1 - risks_and_eff_sizes[:, 0]
+            alpha = self._target_level
+        else:
+            alpha = 1 - self._target_level
+
         valid_thresholds_index = ltt_procedure(
             risks_and_eff_sizes[:, 0],
-            np.array([self._alpha]),
+            np.array([alpha]),
             self._delta,
             risks_and_eff_sizes[:, 1],
             True,
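
The same transformation, pulled out of calibrate() for readability: for a higher-is-better metric the observed values are flipped into risks before being passed to ltt_procedure, and alpha is derived from target_level as in the branch above. A self-contained sketch mirroring that diff; risks_and_alpha is a hypothetical helper, not part of the module:

import numpy as np

def risks_and_alpha(values: np.ndarray, target_level: float, higher_is_better: bool):
    # Mirrors the branch added in calibrate() above.
    if higher_is_better:
        return 1 - values, target_level
    return values, 1 - target_level

risks, alpha = risks_and_alpha(
    np.array([0.91, 0.88, 0.95]), target_level=0.9, higher_is_better=True
)
print(risks, alpha)  # [0.09 0.12 0.05] 0.9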

mapie/tests/test_risk_control.py

Lines changed: 45 additions & 2 deletions
@@ -11,10 +11,17 @@
 from sklearn.pipeline import Pipeline, make_pipeline
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.utils.validation import check_is_fitted
+from sklearn.metrics import precision_score, recall_score, accuracy_score
 from typing_extensions import TypedDict
 
 from numpy.typing import NDArray
-from mapie.risk_control import PrecisionRecallController
+from mapie.risk_control import (
+    PrecisionRecallController,
+    precision,
+    recall,
+    accuracy,
+    BinaryClassificationRisk,
+)
 
 Params = TypedDict(
     "Params",
@@ -260,7 +267,7 @@ def test_predict_output_shape(
         X,
         alpha=alpha,
         bound=args["bound"],
-        delta=.1
+        delta=delta
     )
     n_alpha = len(alpha) if hasattr(alpha, "__len__") else 1
     assert y_pred.shape == y.shape
@@ -808,3 +815,39 @@ def test_method_none_recall() -> None:
     )
     mapie_clf.fit(X_toy, y_toy)
     assert mapie_clf.method == "crc"
+
+
+# The following test is voluntarily agnostic
+# to the specific binary classification risk control implementation.
+@pytest.mark.parametrize(
+    "risk_instance, metric_func, effective_sample_func",
+    [
+        (precision, precision_score, lambda y_true, y_pred: np.sum(y_pred == 1)),
+        (recall, recall_score, lambda y_true, y_pred: np.sum(y_true == 1)),
+        (accuracy, accuracy_score, lambda y_true, y_pred: len(y_true)),
+    ],
+)
+@pytest.mark.parametrize(
+    "y_true, y_pred",
+    [
+        (np.array([1, 0, 1, 0]), np.array([1, 1, 0, 0])),
+        (np.array([1, 1, 0, 0]), np.array([1, 1, 1, 0])),
+        (np.array([0, 0, 0, 0]), np.array([0, 1, 0, 1])),
+    ],
+)
+def test_binary_classification_risk(
+    risk_instance: BinaryClassificationRisk,
+    metric_func,
+    effective_sample_func,
+    y_true,
+    y_pred
+):
+    result = risk_instance.get_value_and_effective_sample_size(y_true, y_pred)
+    if effective_sample_func(y_true, y_pred) == 0:
+        assert result is None
+    else:
+        value, n = result
+        expected_value = metric_func(y_true, y_pred)
+        expected_n = effective_sample_func(y_true, y_pred)
+        assert np.isclose(value, expected_value)
+        assert n == expected_n
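
As a worked instance of the first parametrization above: for the precision risk with y_true = [1, 0, 1, 0] and y_pred = [1, 1, 0, 0], the effective sample size is the number of positive predictions and the value matches sklearn's precision_score. Something like `pytest mapie/tests/test_risk_control.py -k binary_classification_risk` should select the new test.

import numpy as np
from sklearn.metrics import precision_score

y_true = np.array([1, 0, 1, 0])
y_pred = np.array([1, 1, 0, 0])

expected_n = int(np.sum(y_pred == 1))             # 2 positive predictions
expected_value = precision_score(y_true, y_pred)  # 1 true positive out of 2 -> 0.5
print(expected_value, expected_n)                 # 0.5 2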
