Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEA implement a recalibrate parameter for classifier using sampling #1077

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions imblearn/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ def fit(self, X, y):
Return the instance itself.
"""
X, y, _ = self._check_X_y(X, y)
self.sampling_strategy_ = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type
self.sampling_strategy_, self._original_class_counts = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type, return_original_counts=True
)
return self

Expand Down Expand Up @@ -105,8 +105,8 @@ def fit_resample(self, X, y):
arrays_transformer = ArraysTransformer(X, y)
X, y, binarize_y = self._check_X_y(X, y)

self.sampling_strategy_ = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type
self.sampling_strategy_, self._original_class_counts = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type, return_original_counts=True
)

output = self._fit_resample(X, y)
Expand Down Expand Up @@ -363,8 +363,8 @@ def fit(self, X, y):
check_classification_targets(y)
X, y, _ = self._check_X_y(X, y, accept_sparse=self.accept_sparse)

self.sampling_strategy_ = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type
self.sampling_strategy_, self._original_class_counts = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type, return_original_counts=True
)

return self
Expand Down Expand Up @@ -396,8 +396,8 @@ def fit_resample(self, X, y):
check_classification_targets(y)
X, y, binarize_y = self._check_X_y(X, y, accept_sparse=self.accept_sparse)

self.sampling_strategy_ = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type
self.sampling_strategy_, self._original_class_counts = check_sampling_strategy(
self.sampling_strategy, y, self._sampling_type, return_original_counts=True
)

output = self._fit_resample(X, y)
Expand Down
80 changes: 79 additions & 1 deletion imblearn/ensemble/_bagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from sklearn.exceptions import NotFittedError
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.fixes import parse_version
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import check_is_fitted

try:
Expand All @@ -35,7 +36,11 @@
from ..utils._docstring import _n_jobs_docstring, _random_state_docstring
from ..utils._param_validation import HasMethods, Interval, StrOptions
from ..utils.fixes import _fit_context
from ._common import _bagging_parameter_constraints, _estimator_has
from ._common import (
_bagging_parameter_constraints,
_estimate_reweighting,
_estimator_has,
)

sklearn_version = parse_version(sklearn.__version__)

Expand Down Expand Up @@ -121,6 +126,13 @@ class BalancedBaggingClassifier(_ParamsValidationMixin, BaggingClassifier):

.. versionadded:: 0.8

recalibrate : bool, default=False
Whether to recalibrate the output of `predict_proba` and `predict_log_proba`
using the sampling ratio of the different bootstrap samples. Note that the
correction only works for binary classification.

.. versionadded:: 0.13

Attributes
----------
estimator_ : estimator
Expand Down Expand Up @@ -264,6 +276,7 @@ class BalancedBaggingClassifier(_ParamsValidationMixin, BaggingClassifier):
],
"replacement": ["boolean"],
"sampler": [HasMethods(["fit_resample"]), None],
"recalibrate": ["boolean"],
}
)
# TODO: remove when minimum supported version of scikit-learn is 1.4
Expand All @@ -287,6 +300,7 @@ def __init__(
random_state=None,
verbose=0,
sampler=None,
recalibrate=False,
):
super().__init__(
n_estimators=n_estimators,
Expand All @@ -304,6 +318,7 @@ def __init__(
self.sampling_strategy = sampling_strategy
self.replacement = replacement
self.sampler = sampler
self.recalibrate = recalibrate

def _validate_y(self, y):
y_encoded = super()._validate_y(y)
Expand Down Expand Up @@ -371,6 +386,15 @@ def fit(self, X, y):
"""
# overwrite the base class method by disallowing `sample_weight`
self._validate_params()
if self.recalibrate:
# compute the type of target only if we need to recalibrate since this is
# potentially costly
y_type = type_of_target(y)
if y_type != "binary":
raise ValueError(
"Only possible to recalibrate the probabilities for binary "
f"classification. Got {y_type} instead."
)
return super().fit(X, y)

def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
Expand All @@ -388,6 +412,60 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
# None.
return super()._fit(X, y, self.max_samples)

def predict_proba(self, X):
    """Predict class probabilities for X.

    The returned probabilities are the average of the probabilities
    predicted by each base estimator in the ensemble. When a base
    estimator does not provide ``predict_proba``, voting is used instead
    and each class probability is the fraction of estimators predicting
    that class.

    When ``recalibrate=True``, the positive-class probability is
    corrected with the resampling ratio estimated from the internal
    samplers (binary classification only).

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples. Sparse matrices are accepted only if
        they are supported by the base estimator.

    Returns
    -------
    p : ndarray of shape (n_samples, n_classes)
        The class probabilities of the input samples. The order of the
        classes corresponds to that in the attribute :term:`classes_`.
    """
    probabilities = super().predict_proba(X)

    if not self.recalibrate:
        return probabilities

    # Each fitted estimator is a pipeline whose first step is the sampler.
    samplers = [pipeline[0] for pipeline in self.estimators_]
    weight = _estimate_reweighting(samplers)
    pos = probabilities[:, 1]
    corrected = pos / (pos + (1.0 - pos) / weight)
    probabilities[:, 1] = corrected
    probabilities[:, 0] = 1.0 - corrected
    return probabilities

def predict_log_proba(self, X):
    """Predict class log-probabilities for X.

    The log-probabilities are the logarithm of the (possibly
    recalibrated) output of :meth:`predict_proba`, so any probability
    correction is reflected here as well.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples. Sparse matrices are accepted only if
        they are supported by the base estimator.

    Returns
    -------
    p : ndarray of shape (n_samples, n_classes)
        The class log-probabilities of the input samples. The order of the
        classes corresponds to that in the attribute :term:`classes_`.
    """
    # Route through our own `predict_proba` so that the recalibration,
    # when enabled, is applied before taking the log.
    proba = self.predict_proba(X)
    return np.log(proba)

# TODO: remove when minimum supported version of scikit-learn is 1.1
@available_if(_estimator_has("decision_function"))
def decision_function(self, X):
Expand Down
35 changes: 35 additions & 0 deletions imblearn/ensemble/_common.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import copy
from numbers import Integral, Real

import numpy as np
from sklearn.tree._criterion import Criterion

from ..utils._param_validation import (
Expand Down Expand Up @@ -28,6 +30,39 @@ def check(self):
return check


def _estimate_reweighting(samplers):
"""Estimate the reweighting factor to calibrate the probabilities.

The reweighting factor is the averaged ratio of the probability of the
positive class before and after resampling for all samplers.

Parameters
----------
samplers : list of samplers
The list of samplers.

Returns
-------
weight : float
The reweighting factor.
"""
weights = []
for sampler in samplers:
# Since the samplers are internally created, we know that we have target encoded
# with 0 and 1.
p_y_1_original = sampler._original_class_counts[1] / sum(
sampler._original_class_counts[k] for k in [0, 1]
)
resampled_counts = copy.copy(sampler._original_class_counts)
resampled_counts.update(sampler.sampling_strategy_)
p_y_1_resampled = resampled_counts[1] / sum(resampled_counts[k] for k in [0, 1])
weights.append(
(p_y_1_original / (1 - p_y_1_original))
* ((1 - p_y_1_resampled) / p_y_1_resampled)
)
return np.mean(weights)


_bagging_parameter_constraints = {
"estimator": [HasMethods(["fit", "predict"]), None],
"n_estimators": [Interval(Integral, 1, None, closed="left")],
Expand Down
80 changes: 79 additions & 1 deletion imblearn/ensemble/_easy_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from sklearn.exceptions import NotFittedError
from sklearn.utils._tags import _safe_tags
from sklearn.utils.fixes import parse_version
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import check_is_fitted

try:
Expand All @@ -35,7 +36,11 @@
from ..utils._docstring import _n_jobs_docstring, _random_state_docstring
from ..utils._param_validation import Interval, StrOptions
from ..utils.fixes import _fit_context
from ._common import _bagging_parameter_constraints, _estimator_has
from ._common import (
_bagging_parameter_constraints,
_estimate_reweighting,
_estimator_has,
)

MAX_INT = np.iinfo(np.int32).max
sklearn_version = parse_version(sklearn.__version__)
Expand Down Expand Up @@ -85,6 +90,13 @@ class EasyEnsembleClassifier(_ParamsValidationMixin, BaggingClassifier):
verbose : int, default=0
Controls the verbosity of the building process.

recalibrate : bool, default=False
Whether to recalibrate the output of `predict_proba` and `predict_log_proba`
using the sampling ratio of the different bootstrap samples. Note that the
correction only works for binary classification.

.. versionadded:: 0.13

Attributes
----------
estimator_ : estimator
Expand Down Expand Up @@ -198,6 +210,7 @@ class EasyEnsembleClassifier(_ParamsValidationMixin, BaggingClassifier):
callable,
],
"replacement": ["boolean"],
"recalibrate": ["boolean"],
}
)
# TODO: remove when minimum supported version of scikit-learn is 1.4
Expand All @@ -215,6 +228,7 @@ def __init__(
n_jobs=None,
random_state=None,
verbose=0,
recalibrate=False,
):
super().__init__(
n_estimators=n_estimators,
Expand All @@ -231,6 +245,7 @@ def __init__(
self.estimator = estimator
self.sampling_strategy = sampling_strategy
self.replacement = replacement
self.recalibrate = recalibrate

def _validate_y(self, y):
y_encoded = super()._validate_y(y)
Expand Down Expand Up @@ -294,6 +309,15 @@ def fit(self, X, y):
"""
self._validate_params()
# overwrite the base class method by disallowing `sample_weight`
if self.recalibrate:
# compute the type of target only if we need to recalibrate since this is
# potentially costly
y_type = type_of_target(y)
if y_type != "binary":
raise ValueError(
"Only possible to recalibrate the probabilities for binary "
f"classification. Got {y_type} instead."
)
return super().fit(X, y)

def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
Expand All @@ -302,6 +326,60 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
# None.
return super()._fit(X, y, self.max_samples)

def predict_proba(self, X):
    """Predict class probabilities for X.

    The probability for an input sample is the mean of the probabilities
    predicted by the base estimators of the ensemble. If a base estimator
    lacks a ``predict_proba`` method, voting is used and the probability
    of each class is the proportion of estimators predicting it.

    With ``recalibrate=True``, the probability of the positive class is
    corrected using the resampling ratio estimated from the internal
    samplers (binary classification only).

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples. Sparse matrices are accepted only if
        they are supported by the base estimator.

    Returns
    -------
    p : ndarray of shape (n_samples, n_classes)
        The class probabilities of the input samples. The order of the
        classes corresponds to that in the attribute :term:`classes_`.
    """
    probabilities = super().predict_proba(X)

    if not self.recalibrate:
        return probabilities

    # The first step of each fitted estimator pipeline is the sampler.
    samplers = [pipeline[0] for pipeline in self.estimators_]
    weight = _estimate_reweighting(samplers)
    pos = probabilities[:, 1]
    corrected = pos / (pos + (1.0 - pos) / weight)
    probabilities[:, 1] = corrected
    probabilities[:, 0] = 1.0 - corrected
    return probabilities

def predict_log_proba(self, X):
    """Predict class log-probabilities for X.

    Computed as the logarithm of the (possibly recalibrated) result of
    :meth:`predict_proba`, so any probability correction carries over.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The training input samples. Sparse matrices are accepted only if
        they are supported by the base estimator.

    Returns
    -------
    p : ndarray of shape (n_samples, n_classes)
        The class log-probabilities of the input samples. The order of the
        classes corresponds to that in the attribute :term:`classes_`.
    """
    # Use our own `predict_proba` so the recalibration correction, when
    # enabled, is taken into account before the log is applied.
    proba = self.predict_proba(X)
    return np.log(proba)

# TODO: remove when minimum supported version of scikit-learn is 1.1
@available_if(_estimator_has("decision_function"))
def decision_function(self, X):
Expand Down
Loading