From 4cbfa15f1839bdc4f4b8426b4ca423e8c3955dda Mon Sep 17 00:00:00 2001
From: Niek Tax <niek@meta.com>
Date: Tue, 31 Mar 2026 11:47:21 -0700
Subject: [PATCH] Fix prediction shape check in multicalibration library (#257)

Summary:

**Bug: Wrong variable validated in `calibration_free_normalized_entropy` (metrics.py:1401)**
The shape check validated `labels` instead of `predicted_scores`. This meant 2D prediction arrays (e.g., multi-class probabilities) were silently accepted, producing incorrect results, while 2D label arrays were incorrectly rejected.

Differential Revision: D98852527
---
 src/mcgrad/metrics.py            |  2 +-
 src/mcgrad/tests/test_metrics.py | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/mcgrad/metrics.py b/src/mcgrad/metrics.py
index 33c46ffb..ae9d7914 100644
--- a/src/mcgrad/metrics.py
+++ b/src/mcgrad/metrics.py
@@ -1398,7 +1398,7 @@ def calibration_free_normalized_entropy(
     :param max_iter: Maximum number of iterations for the calibration adjustment. Defaults to 10000.
     :return: the calibration-free NE.
     """
-    if len(labels.shape) != 1:
+    if len(predicted_scores.shape) != 1:
         raise ValueError("y_pred must be the predicted probability for class 1 only.")
 
     current_calibration = calibration_ratio(labels, predicted_scores, sample_weight)
diff --git a/src/mcgrad/tests/test_metrics.py b/src/mcgrad/tests/test_metrics.py
index c74547ec..c3b3839f 100644
--- a/src/mcgrad/tests/test_metrics.py
+++ b/src/mcgrad/tests/test_metrics.py
@@ -1774,6 +1774,26 @@ def test_calibration_free_normalized_entropy_higher_for_reversed_predictions():
     assert result_bad > result_good
 
 
+def test_calibration_free_normalized_entropy_rejects_2d_predictions():
+    """A 2D array of predicted scores must be rejected with a ValueError."""
+    y_true = np.array([0, 1, 0, 1])
+    scores_2d = np.array([[0.2, 0.8], [0.7, 0.3], [0.1, 0.9], [0.6, 0.4]])
+    inputs = {"labels": y_true, "predicted_scores": scores_2d}
+
+    with pytest.raises(ValueError, match="y_pred must be the predicted probability"):
+        metrics.calibration_free_normalized_entropy(**inputs)
+
+
+def test_calibration_free_normalized_entropy_accepts_1d_labels():
+    """1D labels with 1D scores pass the shape check and give a float result."""
+    inputs = {
+        "labels": np.array([0, 1, 0, 1]),
+        "predicted_scores": np.array([0.2, 0.8, 0.3, 0.7]),
+    }
+    cfne = metrics.calibration_free_normalized_entropy(**inputs)
+    assert isinstance(cfne, (float, np.floating))
+
+
 def test_rank_calibration_error_zero_for_perfect_ranking():
     labels = np.array([0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
     perfect_predictions = labels * 2.0