Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEA implement a recalibrate parameter for classifier using sampling #1077

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions imblearn/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ def fit(self, X, y):
Return the instance itself.
"""
X, y, _ = self._check_X_y(X, y)
self.sampling_strategy_ = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type
self.sampling_strategy_, self._original_class_counts = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type, return_original_counts=True
)
return self

Expand Down Expand Up @@ -105,8 +105,8 @@ def fit_resample(self, X, y):
arrays_transformer = ArraysTransformer(X, y)
X, y, binarize_y = self._check_X_y(X, y)

self.sampling_strategy_ = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type
self.sampling_strategy_, self._original_class_counts = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type, return_original_counts=True
)

output = self._fit_resample(X, y)
Expand Down Expand Up @@ -363,8 +363,8 @@ def fit(self, X, y):
check_classification_targets(y)
X, y, _ = self._check_X_y(X, y, accept_sparse=self.accept_sparse)

self.sampling_strategy_ = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type
self.sampling_strategy_, self._original_class_counts = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type, return_original_counts=True
)

return self
Expand Down Expand Up @@ -396,8 +396,8 @@ def fit_resample(self, X, y):
check_classification_targets(y)
X, y, binarize_y = self._check_X_y(X, y, accept_sparse=self.accept_sparse)

self.sampling_strategy_ = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type
self.sampling_strategy_, self._original_class_counts = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type, return_original_counts=True
)

output = self._fit_resample(X, y)
Expand Down
80 changes: 79 additions & 1 deletion imblearn/ensemble/_bagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from sklearn.exceptions import NotFittedError
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.fixes import parse_version
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import check_is_fitted

try:
Expand All @@ -35,7 +36,11 @@
from ..utils._docstring import _n_jobs_docstring, _random_state_docstring
from ..utils._param_validation import HasMethods, Interval, StrOptions
from ..utils.fixes import _fit_context
from ._common import _bagging_parameter_constraints, _estimator_has
from ._common import (
_bagging_parameter_constraints,
_estimate_reweighting,
_estimator_has,
)

sklearn_version = parse_version(sklearn.__version__)

Expand Down Expand Up @@ -121,6 +126,13 @@ class BalancedBaggingClassifier(_ParamsValidationMixin, BaggingClassifier):

.. versionadded:: 0.8

recalibrate : bool, default=False
Whether to recalibrate the output of `predict_proba` and `predict_log_proba`
using the sampling ratio of the different bootstrap samples. Note that the
correction only works for binary classification.

.. versionadded:: 0.13

Attributes
----------
estimator_ : estimator
Expand Down Expand Up @@ -264,6 +276,7 @@ class BalancedBaggingClassifier(_ParamsValidationMixin, BaggingClassifier):
],
"replacement": ["boolean"],
"sampler": [HasMethods(["fit_resample"]), None],
"recalibrate": ["boolean"],
}
)
# TODO: remove when minimum supported version of scikit-learn is 1.4
Expand All @@ -287,6 +300,7 @@ def __init__(
random_state=None,
verbose=0,
sampler=None,
recalibrate=False,
):
super().__init__(
n_estimators=n_estimators,
Expand All @@ -304,6 +318,7 @@ def __init__(
self.sampling_strategy = sampling_strategy
self.replacement = replacement
self.sampler = sampler
self.recalibrate = recalibrate

def _validate_y(self, y):
y_encoded = super()._validate_y(y)
Expand Down Expand Up @@ -371,6 +386,15 @@ def fit(self, X, y):
"""
# overwrite the base class method by disallowing `sample_weight`
self._validate_params()
if self.recalibrate:
# compute the type of target only if we need to recalibrate since this is
# potentially costly
y_type = type_of_target(y)
if y_type != "binary":
raise ValueError(
"Only possible to recalibrate the probabilities for binary "
f"classification. Got {y_type} instead."
)
return super().fit(X, y)

def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
Expand All @@ -388,6 +412,60 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
# None.
return super()._fit(X, y, self.max_samples)

def predict_proba(self, X):
    """Predict class probabilities for X.

    The returned probabilities are the average of the probabilities
    predicted by each base estimator in the ensemble. When a base
    estimator does not provide ``predict_proba``, voting is used instead
    and each class probability is the fraction of estimators predicting
    that class.

    When ``recalibrate=True``, the positive-class probability is
    corrected with the resampling ratio estimated from the internal
    samplers (binary classification only).

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples. Sparse matrices are accepted only if
        they are supported by the base estimator.

    Returns
    -------
    p : ndarray of shape (n_samples, n_classes)
        The class probabilities of the input samples. The order of the
        classes corresponds to that in the attribute :term:`classes_`.
    """
    probabilities = super().predict_proba(X)

    if not self.recalibrate:
        return probabilities

    # Each fitted estimator is a pipeline whose first step is the sampler.
    samplers = [pipeline[0] for pipeline in self.estimators_]
    weight = _estimate_reweighting(samplers)
    pos = probabilities[:, 1]
    corrected = pos / (pos + (1.0 - pos) / weight)
    probabilities[:, 1] = corrected
    probabilities[:, 0] = 1.0 - corrected
    return probabilities

def predict_log_proba(self, X):
    """Predict class log-probabilities for X.

    The log-probabilities are the logarithm of the (possibly
    recalibrated) output of :meth:`predict_proba`, so any probability
    correction is reflected here as well.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples. Sparse matrices are accepted only if
        they are supported by the base estimator.

    Returns
    -------
    p : ndarray of shape (n_samples, n_classes)
        The class log-probabilities of the input samples. The order of the
        classes corresponds to that in the attribute :term:`classes_`.
    """
    # Route through our own `predict_proba` so that the recalibration,
    # when enabled, is applied before taking the log.
    proba = self.predict_proba(X)
    return np.log(proba)

# TODO: remove when minimum supported version of scikit-learn is 1.1
@available_if(_estimator_has("decision_function"))
def decision_function(self, X):
Expand Down
35 changes: 35 additions & 0 deletions imblearn/ensemble/_common.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import copy
from numbers import Integral, Real

import numpy as np
from sklearn.tree._criterion import Criterion

from ..utils._param_validation import (
Expand Down Expand Up @@ -28,6 +30,39 @@ def check(self):
return check


def _estimate_reweighting(samplers):
"""Estimate the reweighting factor to calibrate the probabilities.

The reweighting factor is the averaged ratio of the probability of the
positive class before and after resampling for all samplers.

Parameters
----------
samplers : list of samplers
The list of samplers.

Returns
-------
weight : float
The reweighting factor.
"""
weights = []
for sampler in samplers:
# Since the samplers are internally created, we know that we have target encoded
# with 0 and 1.
p_y_1_original = sampler._original_class_counts[1] / sum(
sampler._original_class_counts[k] for k in [0, 1]
)
resampled_counts = copy.copy(sampler._original_class_counts)
resampled_counts.update(sampler.sampling_strategy_)
p_y_1_resampled = resampled_counts[1] / sum(resampled_counts[k] for k in [0, 1])
weights.append(
(p_y_1_original / (1 - p_y_1_original))
* ((1 - p_y_1_resampled) / p_y_1_resampled)
)
return np.mean(weights)


_bagging_parameter_constraints = {
"estimator": [HasMethods(["fit", "predict"]), None],
"n_estimators": [Interval(Integral, 1, None, closed="left")],
Expand Down
80 changes: 79 additions & 1 deletion imblearn/ensemble/_easy_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from sklearn.exceptions import NotFittedError
from sklearn.utils._tags import _safe_tags
from sklearn.utils.fixes import parse_version
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import check_is_fitted

try:
Expand All @@ -35,7 +36,11 @@
from ..utils._docstring import _n_jobs_docstring, _random_state_docstring
from ..utils._param_validation import Interval, StrOptions
from ..utils.fixes import _fit_context
from ._common import _bagging_parameter_constraints, _estimator_has
from ._common import (
_bagging_parameter_constraints,
_estimate_reweighting,
_estimator_has,
)

MAX_INT = np.iinfo(np.int32).max
sklearn_version = parse_version(sklearn.__version__)
Expand Down Expand Up @@ -85,6 +90,13 @@ class EasyEnsembleClassifier(_ParamsValidationMixin, BaggingClassifier):
verbose : int, default=0
Controls the verbosity of the building process.

recalibrate : bool, default=False
Whether to recalibrate the output of `predict_proba` and `predict_log_proba`
using the sampling ratio of the different bootstrap samples. Note that the
correction only works for binary classification.

.. versionadded:: 0.13

Attributes
----------
estimator_ : estimator
Expand Down Expand Up @@ -198,6 +210,7 @@ class EasyEnsembleClassifier(_ParamsValidationMixin, BaggingClassifier):
callable,
],
"replacement": ["boolean"],
"recalibrate": ["boolean"],
}
)
# TODO: remove when minimum supported version of scikit-learn is 1.4
Expand All @@ -215,6 +228,7 @@ def __init__(
n_jobs=None,
random_state=None,
verbose=0,
recalibrate=False,
):
super().__init__(
n_estimators=n_estimators,
Expand All @@ -231,6 +245,7 @@ def __init__(
self.estimator = estimator
self.sampling_strategy = sampling_strategy
self.replacement = replacement
self.recalibrate = recalibrate

def _validate_y(self, y):
y_encoded = super()._validate_y(y)
Expand Down Expand Up @@ -294,6 +309,15 @@ def fit(self, X, y):
"""
self._validate_params()
# overwrite the base class method by disallowing `sample_weight`
if self.recalibrate:
# compute the type of target only if we need to recalibrate since this is
# potentially costly
y_type = type_of_target(y)
if y_type != "binary":
raise ValueError(
"Only possible to recalibrate the probabilities for binary "
f"classification. Got {y_type} instead."
)
return super().fit(X, y)

def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
Expand All @@ -302,6 +326,60 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
# None.
return super()._fit(X, y, self.max_samples)

def predict_proba(self, X):
    """Predict class probabilities for X.

    The probability for an input sample is the mean of the probabilities
    predicted by the base estimators of the ensemble. If a base estimator
    lacks a ``predict_proba`` method, voting is used and the probability
    of each class is the proportion of estimators predicting it.

    With ``recalibrate=True``, the probability of the positive class is
    corrected using the resampling ratio estimated from the internal
    samplers (binary classification only).

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples. Sparse matrices are accepted only if
        they are supported by the base estimator.

    Returns
    -------
    p : ndarray of shape (n_samples, n_classes)
        The class probabilities of the input samples. The order of the
        classes corresponds to that in the attribute :term:`classes_`.
    """
    probabilities = super().predict_proba(X)

    if not self.recalibrate:
        return probabilities

    # The first step of each fitted estimator pipeline is the sampler.
    samplers = [pipeline[0] for pipeline in self.estimators_]
    weight = _estimate_reweighting(samplers)
    pos = probabilities[:, 1]
    corrected = pos / (pos + (1.0 - pos) / weight)
    probabilities[:, 1] = corrected
    probabilities[:, 0] = 1.0 - corrected
    return probabilities

def predict_log_proba(self, X):
    """Predict class log-probabilities for X.

    Computed as the logarithm of the (possibly recalibrated) result of
    :meth:`predict_proba`, so any probability correction carries over.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples. Sparse matrices are accepted only if
        they are supported by the base estimator.

    Returns
    -------
    p : ndarray of shape (n_samples, n_classes)
        The class log-probabilities of the input samples. The order of the
        classes corresponds to that in the attribute :term:`classes_`.
    """
    # Use our own `predict_proba` so the recalibration correction, when
    # enabled, is taken into account before the log is applied.
    proba = self.predict_proba(X)
    return np.log(proba)

# TODO: remove when minimum supported version of scikit-learn is 1.1
@available_if(_estimator_has("decision_function"))
def decision_function(self, X):
Expand Down
Loading