From 94e14c4cd5abce50959174b794b30bd7d4488cae Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Sun, 29 Nov 2020 17:44:55 +0100 Subject: [PATCH 01/16] add CLARA --- sklearn_extra/cluster/__init__.py | 4 +- sklearn_extra/cluster/_k_medoids.py | 220 +++++++++++++++++- sklearn_extra/cluster/tests/test_k_medoids.py | 13 +- 3 files changed, 232 insertions(+), 5 deletions(-) diff --git a/sklearn_extra/cluster/__init__.py b/sklearn_extra/cluster/__init__.py index 0d4cf43c..426f8b99 100644 --- a/sklearn_extra/cluster/__init__.py +++ b/sklearn_extra/cluster/__init__.py @@ -1,4 +1,4 @@ -from ._k_medoids import KMedoids +from ._k_medoids import KMedoids, CLARA from ._commonnn import commonnn, CommonNNClustering -__all__ = ["KMedoids", "CommonNNClustering", "commonnn"] +__all__ = ["KMedoids", "CLARA", "CommonNNClustering", "commonnn"] diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 18fb987d..ea5adc18 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -39,9 +39,9 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin): What distance metric to use. See :func:metrics.pairwise_distances method : {'alternate', 'pam'}, default: 'alternate' - Which algorithm to use. + Which algorithm to use. 'pam' can be more accurate but is slower. - init : {'random', 'heuristic', 'k-medoids++', 'build'}, optional, default: 'build' + init : {'random', 'heuristic', 'k-medoids++', 'build'}, optional, default: 'heuristic' Specify medoid initialization method. 'random' selects n_clusters elements from the dataset. 'heuristic' picks the n_clusters points with the smallest sum distance to every other point. 'k-medoids++' @@ -229,6 +229,7 @@ def fit(self, X, y=None): # update Djs and Ejs with new medoids Djs, Ejs = np.sort(D[medoid_idxs], axis=0)[[0, 1]] + else: raise ValueError( f"method={self.method} is not supported. Supported methods " @@ -476,3 +477,218 @@ def _kpp_init(self, D, n_clusters, random_state_, n_local_trials=None): closest_dist_sq = best_dist_sq return centers + + +class CLARA(BaseEstimator, ClusterMixin, TransformerMixin): + """CLARA clustering. + + Read more in the :ref:`User Guide `. + CLARA (Clustering for Large Applications) extends the k-medoids approach to a + large number of objects. This algorithm uses a sampling approach. + + Parameters + ---------- + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of medoids to + generate. + + metric : string, or callable, optional, default: 'euclidean' + What distance metric to use. See :func:metrics.pairwise_distances + + max_iter : int, optional, default : 300 + Specify the maximum number of iterations when fitting PAM. It can be zero in + which case only the initialization is computed. + + random_state : int, RandomState instance or None, optional + Specify random state for the random number generator. Used to + initialise medoids when init='random'. + + Attributes + ---------- + cluster_centers_ : array, shape = (n_clusters, n_features) + or None if metric == 'precomputed' + Cluster centers, i.e. medoids (elements from the original dataset) + + medoid_indices_ : array, shape = (n_clusters,) + The indices of the medoid rows in X + + labels_ : array, shape = (n_samples,) + Labels of each point + + inertia_ : float + Sum of distances of samples to their closest cluster center. + + Examples + -------- + >>> from sklearn_extra.cluster import KMedoids + >>> import numpy as np + + >>> X = np.asarray([[1, 2], [1, 4], [1, 0], + ...
[4, 2], [4, 4], [4, 0]]) + >>> clara = CLARA(n_clusters=2, random_state=0).fit(X) + >>> clara.labels_ + array([0, 0, 0, 1, 1, 1]) + >>> clara.predict([[0,0], [4,4]]) + array([0, 1]) + >>> clara.cluster_centers_ + array([[1, 2], + [4, 2]]) + >>> clara.inertia_ + 8.0 + + + References + ---------- + Kaufman, L. and Rousseeuw, P.J. (2008). Clustering Large Applications (Program CLARA). + In Finding Groups in Data (eds L. Kaufman and P.J. Rousseeuw). + doi:10.1002/9780470316801.ch3 + + See also + -------- + + KMedoids + CLARA is a variant of KMedoids that uses a sub-sampling scheme; as such, if the + dataset is sufficiently small, KMedoids is preferable. + + Notes + ----- + Contrary to KMedoids, CLARA is linear in N, the sample size, in both space + and time complexity. On the other hand, it scales quadratically with sampling_size. + + """ + + def __init__( + self, + n_clusters=8, + metric="euclidean", + init="build", + max_iter=300, + sampling_size=None, + samples=5, + random_state=None, + ): + self.n_clusters = n_clusters + self.metric = metric + self.init = init + self.max_iter = max_iter + self.sampling_size = sampling_size + self.samples = samples + self.random_state = random_state + + def fit(self, X, y=None): + n = len(X) + + if self.sampling_size is None: + sampling_size = min(n, 40 + 2 * self.n_clusters) + else: + sampling_size = self.sampling_size + rng = np.random.RandomState(self.random_state) + medoids_idxs = rng.choice( + np.arange(n), size=self.n_clusters, replace=False + ) + best_score = np.inf + for _ in range(self.samples): + sample_idxs = np.hstack( + [ + medoids_idxs, + rng.choice( + np.delete(np.arange(n), medoids_idxs), + size=sampling_size - self.n_clusters, + replace=False, + ), + ] + ) + pam = KMedoids( + n_clusters=self.n_clusters, + metric=self.metric, + method="pam", + init=self.init, + max_iter=self.max_iter, + random_state=rng, + ) + pam.fit(X[sample_idxs]) + self.cluster_centers_ = pam.cluster_centers_ + self.inertia_ = self._compute_inertia(self.transform(X)) + + # Keep the sub-sample whose medoids give the best inertia on the whole dataset. + if self.inertia_ < best_score: + best_score = self.inertia_ + # Map medoid positions in the sub-sample back to indices in the full X. + medoids_idxs = sample_idxs[pam.medoid_indices_] + best_sample_idxs = sample_idxs + + self.medoid_indices_ = medoids_idxs + self.labels_ = np.argmin(self.transform(X), axis=0) + + return self + + def _compute_inertia(self, distances): + """Compute inertia of new samples. Inertia is defined as the sum of the + sample distances to closest cluster centers. + + Parameters + ---------- + distances : {array-like, sparse matrix}, shape=(n_samples, n_clusters) + Distances to cluster centers. + + Returns + ------- + Sum of sample distances to closest cluster centers. + """ + + # Define inertia as the sum of the sample-distances + # to closest cluster centers + inertia = np.sum(np.min(distances, axis=1)) + + return inertia + + def transform(self, X): + """Transforms X to cluster-distance space. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_query, n_features), \ + or (n_query, n_indexed) if metric == 'precomputed' + Data to transform. + + Returns + ------- + X_new : {array-like, sparse matrix}, shape=(n_query, n_clusters) + X transformed in the new space of distances to cluster centers.
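+            Only the distances from X to the n_clusters medoids are computed, so a call scales linearly in n_query.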
+ """ + X = check_array(X, accept_sparse=["csr", "csc"]) + + if self.metric == "precomputed": + check_is_fitted(self, "medoid_indices_") + return X[:, self.medoid_indices_] + else: + check_is_fitted(self, "cluster_centers_") + + Y = self.cluster_centers_ + return pairwise_distances(X, Y=Y, metric=self.metric) + + def predict(self, X): + """Predict the closest cluster for each sample in X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_query, n_features), \ + or (n_query, n_indexed) if metric == 'precomputed' + New data to predict. + + Returns + ------- + labels : array, shape = (n_query,) + Index of the cluster each sample belongs to. + """ + X = check_array(X, accept_sparse=["csr", "csc"]) + + if self.metric == "precomputed": + check_is_fitted(self, "medoid_indices_") + return np.argmin(X[:, self.medoid_indices_], axis=1) + else: + check_is_fitted(self, "cluster_centers_") + + # Return data points to clusters based on which cluster assignment + # yields the smallest distance + return pairwise_distances_argmin( + X, Y=self.cluster_centers_, metric=self.metric + ) diff --git a/sklearn_extra/cluster/tests/test_k_medoids.py b/sklearn_extra/cluster/tests/test_k_medoids.py index f6854ee5..eaaf0106 100644 --- a/sklearn_extra/cluster/tests/test_k_medoids.py +++ b/sklearn_extra/cluster/tests/test_k_medoids.py @@ -10,7 +10,7 @@ from sklearn.metrics.pairwise import euclidean_distances from numpy.testing import assert_allclose, assert_array_equal -from sklearn_extra.cluster import KMedoids +from sklearn_extra.cluster import KMedoids, CLARA from sklearn.cluster import KMeans from sklearn.datasets import make_blobs @@ -43,6 +43,17 @@ def test_kmedoid_results(method, init): ) +def test_clara_results(): + expected = np.hstack([np.zeros(50), np.ones(50)]) + km = CLARA(n_clusters=2) + km.fit(X_cc) + # This test use data that are not perfectly separable so the + # accuracy is not 1. Accuracy around 0.85 + assert (np.mean(km.labels_ == expected) > 0.8) or ( + 1 - np.mean(km.labels_ == expected) > 0.8 + ) + + def test_medoids_invalid_method(): with pytest.raises(ValueError, match="invalid is not supported"): KMedoids(n_clusters=1, method="invalid").fit([[0, 1], [1, 1]]) From 51d4034e8945adc10d014c66df4dff673bebe9ea Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Mon, 30 Nov 2020 11:28:33 +0100 Subject: [PATCH 02/16] add example --- examples/plot_clara_digits.py | 121 ++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 examples/plot_clara_digits.py diff --git a/examples/plot_clara_digits.py b/examples/plot_clara_digits.py new file mode 100644 index 00000000..9eb63275 --- /dev/null +++ b/examples/plot_clara_digits.py @@ -0,0 +1,121 @@ +""" +====================================================================== +A demo of K-Medoids vs CLARA clustering on the handwritten digits data +====================================================================== +In this example we compare different computation time of K-Medoids and CLARA on +the handwritten digits data. 
+""" +import numpy as np +import matplotlib.pyplot as plt +import time + +from sklearn_extra.cluster import KMedoids, CLARA +from sklearn.datasets import load_digits +from sklearn.decomposition import PCA +from sklearn.preprocessing import scale + +print(__doc__) + +# Authors: Timo Erkkilä +# Antti Lehmussola +# Kornel Kiełczewski +# License: BSD 3 clause + +np.random.seed(42) + +digits = load_digits() +data = scale(digits.data) +n_digits = len(np.unique(digits.target)) + +reduced_data = PCA(n_components=2).fit_transform(data) + +# Step size of the mesh. Decrease to increase the quality of the VQ. +h = 0.02 # point in the mesh [x_min, m_max]x[y_min, y_max]. + +# Plot the decision boundary. For that, we will assign a color to each +x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1 +y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1 +xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) + +plt.figure() +plt.clf() + +plt.suptitle( + "Comparing multiple K-Medoids metrics to K-Means and each other", + fontsize=14, +) + + +selected_models = [ + ( + KMedoids(metric="cosine", n_clusters=n_digits), + "KMedoids (cosine)", + ), + ( + KMedoids(metric="manhattan", n_clusters=n_digits), + "KMedoids (manhattan)", + ), + ( + CLARA( + metric="cosine", + n_clusters=n_digits, + init="heuristic", + sampling_size=50, + ), + "CLARA (cosine)", + ), + ( + CLARA( + metric="manhattan", + n_clusters=n_digits, + init="heuristic", + sampling_size=50, + ), + "CLARA (manhattan)", + ), +] + +plot_rows = int(np.ceil(len(selected_models) / 2.0)) +plot_cols = 2 + +for i, (model, description) in enumerate(selected_models): + + # Obtain labels for each point in mesh. Use last trained model. + init_time = time.time() + model.fit(reduced_data) + Z = model.predict(np.c_[xx.ravel(), yy.ravel()]) + computation_time = time.time() - init_time + + # Put the result into a color plot + Z = Z.reshape(xx.shape) + plt.subplot(plot_cols, plot_rows, i + 1) + plt.imshow( + Z, + interpolation="nearest", + extent=(xx.min(), xx.max(), yy.min(), yy.max()), + cmap=plt.cm.Paired, + aspect="auto", + origin="lower", + ) + + plt.plot( + reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2, alpha=0.3 + ) + # Plot the centroids as a white X + centroids = model.cluster_centers_ + plt.scatter( + centroids[:, 0], + centroids[:, 1], + marker="x", + s=169, + linewidths=3, + color="w", + zorder=10, + ) + plt.title(description + ": %.2Fs" % (computation_time)) + plt.xlim(x_min, x_max) + plt.ylim(y_min, y_max) + plt.xticks(()) + plt.yticks(()) + +plt.show() From 6204cba82fafef12b0cd2893b7c87a19b78ee44f Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Mon, 30 Nov 2020 11:39:17 +0100 Subject: [PATCH 03/16] fix typo --- examples/plot_clara_digits.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/plot_clara_digits.py b/examples/plot_clara_digits.py index 9eb63275..9afcb7c8 100644 --- a/examples/plot_clara_digits.py +++ b/examples/plot_clara_digits.py @@ -41,7 +41,7 @@ plt.clf() plt.suptitle( - "Comparing multiple K-Medoids metrics to K-Means and each other", + "Comparing KMedoids and CLARA", fontsize=14, ) From 61a580c5b028304b80f2f55b07548c71fbaf2d35 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Fri, 18 Dec 2020 19:06:37 +0100 Subject: [PATCH 04/16] add doc --- doc/api.rst | 1 + doc/modules/cluster.rst | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/doc/api.rst b/doc/api.rst index 57b36246..25fc8ed8 100644 --- 
a/doc/api.rst +++ b/doc/api.rst @@ -32,6 +32,7 @@ Clustering cluster.KMedoids cluster.CommonNNClustering + cluster.CLARA Robust ==================== diff --git a/doc/modules/cluster.rst b/doc/modules/cluster.rst index bb351308..d987fb3e 100644 --- a/doc/modules/cluster.rst +++ b/doc/modules/cluster.rst @@ -80,6 +80,38 @@ when speed is an issue. for performing face recognition. International Journal of Soft Computing, Mathematics and Control, 3(3), pp 1-12. + + +CLARA +===== + + :class:`CLARA` is related to the :class:`KMedoids` algorithm. CLARA + (Clustering for Large Applications) extends k-medoids approach for a + large number of objects. This algorithm use a sampling approach. + + .. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_plot_clara_digits.py`: Comparing K-Medoids and CLARA on digits + with various distance metrics. + + + **Algorithm description:** + CLARA use `sample` random samples of the dataset, each of size `sampling_size` + The algorithm is iterative: first we select one sub-sample, then CLARA applies + KMedoids on this sub-sample to obtain `n_clusters` medoids. At the next step, + CLARA samples `sampling_size` - `n_clusters` new points from the dataset and the next sub-sample + is composed of the best medoids found until now (with respect to inertia in the + whole dataset, not the inertia only on the sub-sample) to which we add the new + samples just drawn. Then, K-Medoids is applied to this new sub-sample, and we loop + back until `sample` sub-samples have been used. + + + .. topic:: References: + + * Kaufman, L. and Rousseeuw, P.J. (2008). Clustering Large Applications (Program CLARA). + In Finding Groups in Data (eds L. Kaufman and P.J. Rousseeuw). + doi:10.1002/9780470316801.ch3 + .. _commonnn: Common-nearest-neighbors clustering From ebabc8a39a0811b434016a2aad9695fce2823b9b Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Fri, 18 Dec 2020 19:06:48 +0100 Subject: [PATCH 05/16] fix docstring --- sklearn_extra/cluster/_k_medoids.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 8abaa035..5c27bc69 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -520,22 +520,16 @@ class CLARA(BaseEstimator, ClusterMixin, TransformerMixin): Examples -------- - >>> from sklearn_extra.cluster import KMedoids + >>> from sklearn_extra.cluster import CLARA >>> import numpy as np - - >>> X = np.asarray([[1, 2], [1, 4], [1, 0], - ...
[4, 2], [4, 4], [4, 0]]) + >>> from sklearn.datasets import make_blobs + >>> X, _ = make_blobs(centers=[[0,0],[1,1]],n_samples=100, n_features=2, + random_state=0) >>> clara = CLARA(n_clusters=2, random_state=0).fit(X) - >>> clara.labels_ - array([0, 0, 0, 1, 1, 1]) >>> clara.predict([[0,0], [4,4]]) array([0, 1]) - >>> clara.cluster_centers_ - array([[1, 2], - [4, 2]]) >>> clara.inertia_ - 8.0 - + 122.44919397611667 References ---------- Kaufman, L. and Rousseeuw, P.J. (2008). Clustering Large Applications (Program CLARA). In Finding Groups in Data (eds L. Kaufman and P.J. Rousseeuw). doi:10.1002/9780470316801.ch3 @@ -616,7 +610,7 @@ def fit(self, X, y=None): best_sample_idxs = sample_idxs self.medoid_indices_ = medoids_idxs - self.labels_ = np.argmin(self.transform(X), axis=0) + self.labels_ = np.argmin(self.transform(X), axis=1) return self From 007d0e8e360d1fd565d20c189ccbc4aa41fbf1d5 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Mon, 21 Dec 2020 10:36:41 +0100 Subject: [PATCH 06/16] add CLARA to test_common --- sklearn_extra/cluster/_k_medoids.py | 60 +++++++++++++++++++++++++---- sklearn_extra/tests/test_common.py | 3 +- 2 files changed, 55 insertions(+), 8 deletions(-) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 5c27bc69..cf4b8854 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -200,7 +200,15 @@ def fit(self, X, y=None): if self.method == "pam": # Compute the distance to the first and second closest points # among medoids. - Djs, Ejs = np.sort(D[medoid_idxs], axis=0)[[0, 1]] + if self.n_clusters == 1 and self.max_iter > 0: + # PAM SWAP step can only be used for n_clusters > 1 + warnings.warn( + "n_clusters should be larger than 1 if max_iter != 0; " + "setting max_iter to 0." + ) + self.max_iter = 0 + elif self.max_iter > 0: + Djs, Ejs = np.sort(D[medoid_idxs], axis=0)[[0, 1]] # Continue the algorithm as long as # the medoids keep changing and the maximum number @@ -496,8 +504,17 @@ class CLARA(BaseEstimator, ClusterMixin, TransformerMixin): What distance metric to use. See :func:metrics.pairwise_distances max_iter : int, optional, default : 300 - Specify the maximum number of iterations when fitting PAM. It can be zero in - which case only the initialization is computed. + Specify the maximum number of iterations when fitting PAM. It can be zero + in which case only the initialization is computed. + + sampling_size : int or None, optional, default : None + Size of the sampled dataset at each iteration. This is a trade-off + between complexity and efficiency. If None, it is set + to min(n_samples, 40 + 2 * self.n_clusters), as suggested by the authors of the + algorithm. It must be smaller than n_samples. + + samples : int, optional, default : 5 + Number of sub-samples to draw, i.e. the number of sampling iterations. random_state : int, RandomState instance or None, optional Specify random state for the random number generator. Used to @@ -570,14 +587,42 @@ def __init__( self.random_state = random_state def fit(self, X, y=None): + """Fit CLARA to the provided data. + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features), \ + or (n_samples, n_samples) if metric == 'precomputed' + Dataset to cluster. + + y : Ignored + + Returns + ------- + self + """ + X = check_array(X) n = len(X) + random_state_ = check_random_state(self.random_state) + if self.sampling_size is None: sampling_size = min(n, 40 + 2 * self.n_clusters) else: sampling_size = self.sampling_size - rng = np.random.RandomState(self.random_state) - medoids_idxs = rng.choice( + + # Check sampling_size.
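+        # The checks below require the dataset to contain at least sampling_size points (so that a full sub-sample can be drawn) and at least n_clusters points (so that n_clusters distinct medoids exist).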
+ if n < sampling_size: + raise ValueError( + "sample_size should be greater than self.sampling_size" + ) + + if n < self.n_clusters: + raise ValueError( + "sample_size should be greater than self.n_clusters" + ) + + medoids_idxs = random_state_.choice( np.arange(n), size=self.n_clusters, replace=False ) best_score = np.inf @@ -585,7 +630,7 @@ def fit(self, X, y=None): sample_idxs = np.hstack( [ medoids_idxs, - rng.choice( + random_state_.choice( np.delete(np.arange(n), medoids_idxs), size=sampling_size - self.n_clusters, replace=False, @@ -598,7 +643,7 @@ def fit(self, X, y=None): method="pam", init=self.init, max_iter=self.max_iter, - random_state=rng, + random_state=random_state_, ) pam.fit(X[sample_idxs]) self.cluster_centers_ = pam.cluster_centers_ @@ -611,6 +656,7 @@ def fit(self, X, y=None): self.medoid_indices_ = medoids_idxs self.labels_ = np.argmin(self.transform(X), axis=1) + self.n_iter_ = self.samples return self diff --git a/sklearn_extra/tests/test_common.py b/sklearn_extra/tests/test_common.py index 2da6cf22..3a72dc32 100644 --- a/sklearn_extra/tests/test_common.py +++ b/sklearn_extra/tests/test_common.py @@ -3,7 +3,7 @@ from sklearn_extra.kernel_approximation import Fastfood from sklearn_extra.kernel_methods import EigenProClassifier, EigenProRegressor -from sklearn_extra.cluster import KMedoids, CommonNNClustering +from sklearn_extra.cluster import KMedoids, CommonNNClustering, CLARA from sklearn_extra.robust import ( RobustWeightedClassifier, RobustWeightedRegressor, @@ -14,6 +14,7 @@ ALL_ESTIMATORS = [ Fastfood, KMedoids, + CLARA, EigenProClassifier, EigenProRegressor, CommonNNClustering, From 6a461d62b28842f613d3ec46dabc36b4278aade7 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Mon, 21 Dec 2020 11:03:07 +0100 Subject: [PATCH 07/16] add size check to pass tests --- sklearn_extra/cluster/_k_medoids.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index cf4b8854..eb24c87f 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -540,8 +540,7 @@ class CLARA(BaseEstimator, ClusterMixin, TransformerMixin): >>> from sklearn_extra.cluster import CLARA >>> import numpy as np >>> from sklearn.datasets import make_blobs - >>> X, _ = make_blobs(centers=[[0,0],[1,1]],n_samples=100, n_features=2, - random_state=0) + >>> X, _ = make_blobs(centers=[[0,0],[1,1]], n_features=2,random_state=0) >>> clara = CLARA(n_clusters=2, random_state=0).fit(X) >>> clara.predict([[0,0], [4,4]]) array([0, 1]) @@ -622,21 +621,17 @@ def fit(self, X, y=None): "sample_size should be greater than self.n_clusters" ) + if self.n_clusters <= sampling_size: + raise ValueError( + "sampling size must be strictly greater than self.n_clustes" + ) + medoids_idxs = random_state_.choice( np.arange(n), size=self.n_clusters, replace=False ) best_score = np.inf for _ in range(self.samples): - sample_idxs = np.hstack( - [ - medoids_idxs, - random_state_.choice( - np.delete(np.arange(n), medoids_idxs), - size=sampling_size - self.n_clusters, - replace=False, - ), - ] - ) + pam = KMedoids( n_clusters=self.n_clusters, metric=self.metric, From 6e1cdedefb9ed50ff1e3c880192ed247bf9dc91d Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Mon, 21 Dec 2020 11:38:39 +0100 Subject: [PATCH 08/16] fix tests --- sklearn_extra/cluster/_k_medoids.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/sklearn_extra/cluster/_k_medoids.py 
b/sklearn_extra/cluster/_k_medoids.py index eb24c87f..cd1779e4 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -606,24 +606,22 @@ def fit(self, X, y=None): random_state_ = check_random_state(self.random_state) if self.sampling_size is None: - sampling_size = min(n, 40 + 2 * self.n_clusters) + sampling_size = max( + min(n, 40 + 2 * self.n_clusters), self.n_clusters + 1 + ) else: sampling_size = self.sampling_size # Check sampling_size. - if n < sampling_size: - raise ValueError( - "sample_size should be greater than self.sampling_size" - ) if n < self.n_clusters: raise ValueError( "sample_size should be greater than self.n_clusters" ) - if self.n_clusters <= sampling_size: + if self.n_clusters >= sampling_size: raise ValueError( - "sampling size must be strictly greater than self.n_clustes" + "sampling size must be strictly greater than self.n_clusters" ) medoids_idxs = random_state_.choice( @@ -631,7 +629,19 @@ def fit(self, X, y=None): ) best_score = np.inf for _ in range(self.samples): - + if sampling_size >= n: + sample_idxs = np.arange(n) + else: + sample_idxs = np.hstack( + [ + medoids_idxs, + random_state_.choice( + np.delete(np.arange(n), medoids_idxs), + size=sampling_size - self.n_clusters, + replace=False, + ), + ] + ) pam = KMedoids( n_clusters=self.n_clusters, metric=self.metric, From a95c1c383fa782f32edf412ae23651655c0f1026 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Sun, 18 Apr 2021 14:21:32 +0200 Subject: [PATCH 09/16] update doc --- doc/modules/cluster.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/cluster.rst b/doc/modules/cluster.rst index 722f99e3..21d5329d 100644 --- a/doc/modules/cluster.rst +++ b/doc/modules/cluster.rst @@ -1,8 +1,8 @@ .. _cluster: -===================================================== -Clustering with KMedoids and Common-nearest-neighbors -===================================================== +============================================================ +Clustering with KMedoids, CLARA and Common-nearest-neighbors +============================================================ .. _k_medoids: K-Medoids @@ -89,7 +89,7 @@ CLARA :class:`CLARA` is related to the :class:`KMedoids` algorithm. CLARA (Clustering for Large Applications) extends k-medoids approach for a - large number of objects. This algorithm use a sampling approach. + large number of objects. This algorithm uses a sampling approach. .. topic:: Examples: From 107db44e598d68f8a03c7d94cc7219d2613ebb82 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Sat, 29 May 2021 11:05:16 +0200 Subject: [PATCH 10/16] add test consistency clara kmedoids --- sklearn_extra/cluster/tests/test_k_medoids.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/sklearn_extra/cluster/tests/test_k_medoids.py b/sklearn_extra/cluster/tests/test_k_medoids.py index 857c7258..d35ad568 100644 --- a/sklearn_extra/cluster/tests/test_k_medoids.py +++ b/sklearn_extra/cluster/tests/test_k_medoids.py @@ -348,7 +348,7 @@ def test_kmedoids_on_sparse_input(): # Test the build initialization. 
def test_build(): X, y = fetch_20newsgroups_vectorized(return_X_y=True) - # Select only the first 1000 samples + # Select only the first 500 samples X = X[:500] y = y[:500] # Precompute cosine distance matrix @@ -358,3 +358,20 @@ def test_build(): ske.fit(diss) assert ske.inertia_ <= 230 assert len(np.unique(ske.labels_)) == 20 + + +def test_clara_consistency_iris(): + # Test that CLARA matches PAM when the full sample is used. + + rng = np.random.RandomState(seed) + X_iris = load_iris()["data"] + + clara = CLARA( + n_clusters=3, samples=1, sampling_size=len(X_iris), random_state=rng + ) + + model = KMedoids(n_clusters=3, init="build", random_state=rng) + + model.fit(X_iris) + clara.fit(X_iris) + assert np.sum(model.labels_ == clara.labels_) == len(X_iris) From e7b88cd15955714a53eb2a3c28efb809b05176d9 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Sat, 29 May 2021 11:12:00 +0200 Subject: [PATCH 11/16] black --- sklearn_extra/cluster/_k_medoids.py | 4 ++-- sklearn_extra/kernel_approximation/_fastfood.py | 4 ++-- sklearn_extra/robust/robust_weighted_estimator.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 3f0484a5..9a2baac6 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -152,7 +152,7 @@ def _check_nonnegative_int(self, value, desc, strict=True): ) def _check_init_args(self): - """Validates the input arguments. """ + """Validates the input arguments.""" # Check n_clusters and max_iter self._check_nonnegative_int(self.n_clusters, "n_clusters") @@ -307,7 +307,7 @@ def _update_medoid_idxs_in_place(self, D, labels, medoid_idxs): medoid_idxs[k] = cluster_k_idxs[min_cost_idx] def _compute_cost(self, D, medoid_idxs): - """ Compute the cost for a given configuration of the medoids""" + """Compute the cost for a given configuration of the medoids""" return self._compute_inertia(D[:, medoid_idxs]) def transform(self, X): diff --git a/sklearn_extra/kernel_approximation/_fastfood.py b/sklearn_extra/kernel_approximation/_fastfood.py index e30d042f..715dfc4a 100644 --- a/sklearn_extra/kernel_approximation/_fastfood.py +++ b/sklearn_extra/kernel_approximation/_fastfood.py @@ -116,7 +116,7 @@ def _uniform_vector(self, rng): return None def _apply_approximate_gaussian_matrix(self, B, G, P, X): - """ Create mapping of all x_i by applying B, G and P step-wise """ + """Create mapping of all x_i by applying B, G and P step-wise""" num_examples = X.shape[0] result = np.multiply(B, X.reshape((1, num_examples, 1, self._d))) @@ -134,7 +134,7 @@ def _apply_approximate_gaussian_matrix(self, B, G, P, X): return result def _scale_transformed_data(self, S, VX): - """ Scale mapped data VX to match kernel(e.g. RBF-Kernel) """ + """Scale mapped data VX to match kernel(e.g. RBF-Kernel)""" VX = VX.reshape(-1, self._times_to_stack_v * self._d) return ( diff --git a/sklearn_extra/robust/robust_weighted_estimator.py b/sklearn_extra/robust/robust_weighted_estimator.py index 2e947550..1e74f963 100644 --- a/sklearn_extra/robust/robust_weighted_estimator.py +++ b/sklearn_extra/robust/robust_weighted_estimator.py @@ -418,7 +418,7 @@ def fit(self, X, y=None): return self def _get_loss_function(self, loss): - """Get concrete ''LossFunction'' object for str ''loss''.
""" + """Get concrete ''LossFunction'' object for str ''loss''.""" if type(loss) == str: eff_loss = LOSS_FUNCTIONS.get(loss) if eff_loss is None: From fd3fa7242d79dc84038920fa66aacc70208138d8 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Fri, 18 Jun 2021 10:49:57 +0200 Subject: [PATCH 12/16] handle types KMedoids --- sklearn_extra/cluster/_k_medoids.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 9a2baac6..528104d6 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -194,6 +194,7 @@ def fit(self, X, y=None): ) D = pairwise_distances(X, metric=self.metric) + medoid_idxs = self._initialize_medoids( D, self.n_clusters, random_state_ ) @@ -202,6 +203,11 @@ def fit(self, X, y=None): if self.method == "pam": # Compute the distance to the first and second closest points # among medoids. + if (X.dtype is np.dtype(np.float32)) or ( + X.dtype is np.dtype(np.float16) + ): + D = D.astype(np.float32) + if self.n_clusters == 1 and self.max_iter > 0: # PAM SWAP step can only be used for n_clusters > 1 warnings.warn( @@ -211,6 +217,8 @@ def fit(self, X, y=None): self.max_iter = 0 elif self.max_iter > 0: Djs, Ejs = np.sort(D[medoid_idxs], axis=0)[[0, 1]] + elif self.init != "build": + D = D.astype(X.dtype) # Continue the algorithm as long as # the medoids keep changing and the maximum number From 06d7650e48ff59560da2783e9858856f8fb615c3 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu <30346931+TimotheeMathieu@users.noreply.github.com> Date: Thu, 24 Jun 2021 19:11:55 +0200 Subject: [PATCH 13/16] Apply suggestions from code review Co-authored-by: Roman Yurchak --- doc/modules/cluster.rst | 6 +++--- sklearn_extra/cluster/_k_medoids.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/modules/cluster.rst b/doc/modules/cluster.rst index 21d5329d..5bf9e259 100644 --- a/doc/modules/cluster.rst +++ b/doc/modules/cluster.rst @@ -88,8 +88,8 @@ CLARA ===== :class:`CLARA` is related to the :class:`KMedoids` algorithm. CLARA - (Clustering for Large Applications) extends k-medoids approach for a - large number of objects. This algorithm uses a sampling approach. + (Clustering for Large Applications) extends k-medoids to be more scalable, + uses a sampling approach. .. topic:: Examples: @@ -98,7 +98,7 @@ CLARA **Algorithm description:** - CLARA use `sample` random samples of the dataset, each of size `sampling_size` + CLARA uses random samples of the dataset, each of size `sampling_size` The algorith is iterative, first we select one sub-sample, then CLARA applies KMedoids on this sub-sample to obtain `n_clusters` medoids. At the next step, CLARA sample `sampling_size`-`n_clusters` from the dataset and the next sub-sample diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 528104d6..9f761ac6 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -724,7 +724,7 @@ def transform(self, X): X_new : {array-like, sparse matrix}, shape=(n_query, n_clusters) X transformed in the new space of distances to cluster centers. """ - X = check_array(X, accept_sparse=["csr", "csc"]) + X = check_array(X, accept_sparse=["csr", "csc"], dtypes=[np.float64, np.float32]) if self.metric == "precomputed": check_is_fitted(self, "medoid_indices_") @@ -749,7 +749,7 @@ def predict(self, X): labels : array, shape = (n_query,) Index of the cluster each sample belongs to. 
""" - X = check_array(X, accept_sparse=["csr", "csc"]) + X = check_array(X, accept_sparse=["csr", "csc"], dtypes=[np.float64, np.float32]) if self.metric == "precomputed": check_is_fitted(self, "medoid_indices_") From ed91aebe2a147cffa6fb8f08ad3088cd3be4c706 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Thu, 24 Jun 2021 20:19:06 +0200 Subject: [PATCH 14/16] correct 32 bit --- sklearn_extra/cluster/_k_medoids.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 9f761ac6..528104d6 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -724,7 +724,7 @@ def transform(self, X): X_new : {array-like, sparse matrix}, shape=(n_query, n_clusters) X transformed in the new space of distances to cluster centers. """ - X = check_array(X, accept_sparse=["csr", "csc"], dtypes=[np.float64, np.float32]) + X = check_array(X, accept_sparse=["csr", "csc"]) if self.metric == "precomputed": check_is_fitted(self, "medoid_indices_") @@ -749,7 +749,7 @@ def predict(self, X): labels : array, shape = (n_query,) Index of the cluster each sample belongs to. """ - X = check_array(X, accept_sparse=["csr", "csc"], dtypes=[np.float64, np.float32]) + X = check_array(X, accept_sparse=["csr", "csc"]) if self.metric == "precomputed": check_is_fitted(self, "medoid_indices_") From b5fcfdb076e8777f203cd05872bc4859a3912201 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Fri, 25 Jun 2021 09:13:29 +0200 Subject: [PATCH 15/16] change name variables --- examples/plot_clara_digits.py | 4 +- sklearn_extra/cluster/_k_medoids.py | 50 +++++++++---------- sklearn_extra/cluster/tests/test_k_medoids.py | 5 +- 3 files changed, 30 insertions(+), 29 deletions(-) diff --git a/examples/plot_clara_digits.py b/examples/plot_clara_digits.py index 9afcb7c8..e1bb1f54 100644 --- a/examples/plot_clara_digits.py +++ b/examples/plot_clara_digits.py @@ -60,7 +60,7 @@ metric="cosine", n_clusters=n_digits, init="heuristic", - sampling_size=50, + n_sampling=50, ), "CLARA (cosine)", ), @@ -69,7 +69,7 @@ metric="manhattan", n_clusters=n_digits, init="heuristic", - sampling_size=50, + n_sampling=50, ), "CLARA (manhattan)", ), diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index d672e33a..128b52bf 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -205,10 +205,6 @@ def fit(self, X, y=None): if self.method == "pam": # Compute the distance to the first and second closest points # among medoids. - if (X.dtype is np.dtype(np.float32)) or ( - X.dtype is np.dtype(np.float16) - ): - D = D.astype(np.float32) if self.n_clusters == 1 and self.max_iter > 0: # PAM SWAP step can only be used for n_clusters > 1 @@ -219,8 +215,6 @@ def fit(self, X, y=None): self.max_iter = 0 elif self.max_iter > 0: Djs, Ejs = np.sort(D[medoid_idxs], axis=0)[[0, 1]] - elif self.init != "build": - D = D.astype(X.dtype) # Continue the algorithm as long as # the medoids keep changing and the maximum number @@ -538,13 +532,13 @@ class CLARA(BaseEstimator, ClusterMixin, TransformerMixin): Specify the maximum number of iterations when fitting PAM. It can be zero in which case only the initialization is computed. - sampling_size : int or None, optional, default : None + n_sampling : int or None, optional, default : None Size of the sampled dataset at each iteration. sampling-size a trade-off between complexity and efficiency. 
If None, it is set to min(n_samples, 40 + 2 * self.n_clusters), as suggested by the authors of the algorithm. It must be smaller than n_samples. - samples : int, optional, default : 5 + n_sampling_iter : int, optional, default : 5 Number of sub-samples to draw, i.e. the number of sampling iterations. random_state : int, RandomState instance or None, optional @@ -594,7 +588,7 @@ class CLARA(BaseEstimator, ClusterMixin, TransformerMixin): Notes ----- Contrary to KMedoids, CLARA is linear in N, the sample size, in both space - and time complexity. On the other hand, it scales quadratically with sampling_size. + and time complexity. On the other hand, it scales quadratically with n_sampling. """ @@ -604,16 +598,16 @@ def __init__( self, n_clusters=8, metric="euclidean", init="build", max_iter=300, - sampling_size=None, - samples=5, + n_sampling=None, + n_sampling_iter=5, random_state=None, ): self.n_clusters = n_clusters self.metric = metric self.init = init self.max_iter = max_iter - self.sampling_size = sampling_size - self.samples = samples + self.n_sampling = n_sampling + self.n_sampling_iter = n_sampling_iter self.random_state = random_state def fit(self, X, y=None): @@ -622,7 +616,7 @@ def fit(self, X, y=None): Parameters ---------- X : array-like, shape = (n_samples, n_features), \ or (n_samples, n_samples) if metric == 'precomputed' Dataset to cluster. y : Ignored Returns ------- self """ - X = check_array(X) + X = check_array(X, dtype=[np.float64, np.float32]) n = len(X) random_state_ = check_random_state(self.random_state) - if self.sampling_size is None: - sampling_size = max( + if self.n_sampling is None: + n_sampling = max( min(n, 40 + 2 * self.n_clusters), self.n_clusters + 1 ) else: - sampling_size = self.sampling_size + n_sampling = self.n_sampling - # Check sampling_size. + # Check n_sampling. if n < self.n_clusters: raise ValueError( "sample_size should be greater than self.n_clusters" ) - if self.n_clusters >= sampling_size: + if self.n_clusters >= n_sampling: raise ValueError( "sampling size must be strictly greater than self.n_clusters" ) medoids_idxs = random_state_.choice( np.arange(n), size=self.n_clusters, replace=False ) best_score = np.inf - for _ in range(self.samples): - if sampling_size >= n: + for _ in range(self.n_sampling_iter): + if n_sampling >= n: sample_idxs = np.arange(n) else: sample_idxs = np.hstack( [ medoids_idxs, random_state_.choice( np.delete(np.arange(n), medoids_idxs), - size=sampling_size - self.n_clusters, + size=n_sampling - self.n_clusters, replace=False, ), ] ) pam = KMedoids( n_clusters=self.n_clusters, metric=self.metric, method="pam", init=self.init, max_iter=self.max_iter, random_state=random_state_, ) pam.fit(X[sample_idxs]) self.cluster_centers_ = pam.cluster_centers_ self.inertia_ = self._compute_inertia(self.transform(X)) @@ -686,7 +686,7 @@ def fit(self, X, y=None): self.medoid_indices_ = medoids_idxs self.labels_ = np.argmin(self.transform(X), axis=1) - self.n_iter_ = self.samples + self.n_iter_ = self.n_sampling_iter return self def transform(self, X): @@ -724,7 +724,9 @@ def transform(self, X): X_new : {array-like, sparse matrix}, shape=(n_query, n_clusters) X transformed in the new space of distances to cluster centers. """ - X = check_array(X, accept_sparse=["csr", "csc"]) + X = check_array( + X, accept_sparse=["csr", "csc"], dtype=[np.float64, np.float32] + ) if self.metric == "precomputed": check_is_fitted(self, "medoid_indices_") return X[:, self.medoid_indices_] else: check_is_fitted(self, "cluster_centers_") Y = self.cluster_centers_ return pairwise_distances(X, Y=Y, metric=self.metric) def predict(self, X): @@ -749,7 +751,9 @@ def predict(self, X): labels : array, shape = (n_query,) Index of the cluster each sample belongs to.
""" - X = check_array(X, accept_sparse=["csr", "csc"]) + X = check_array( + X, accept_sparse=["csr", "csc"], dtype=[np.float64, np.float32] + ) if self.metric == "precomputed": check_is_fitted(self, "medoid_indices_") diff --git a/sklearn_extra/cluster/tests/test_k_medoids.py b/sklearn_extra/cluster/tests/test_k_medoids.py index 8646b990..89742f3d 100644 --- a/sklearn_extra/cluster/tests/test_k_medoids.py +++ b/sklearn_extra/cluster/tests/test_k_medoids.py @@ -370,7 +370,10 @@ def test_clara_consistency_iris(): X_iris = load_iris()["data"] clara = CLARA( - n_clusters=3, samples=1, sampling_size=len(X_iris), random_state=rng + n_clusters=3, + n_sampling_iter=1, + n_sampling=len(X_iris), + random_state=rng, ) model = KMedoids(n_clusters=3, init="build", random_state=rng) From 30652cfe1887e8bc765dc11dcdcbf0d71ccbea0e Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Fri, 25 Jun 2021 09:21:33 +0200 Subject: [PATCH 16/16] create private function inertia and changelog --- doc/changelog.rst | 3 ++ sklearn_extra/cluster/_k_medoids.py | 67 +++++++++++------------------ 2 files changed, 27 insertions(+), 43 deletions(-) diff --git a/doc/changelog.rst b/doc/changelog.rst index 36aa2ecc..053b9197 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -4,6 +4,9 @@ Changelog Unreleased ---------- +- Add `CLARA` (Clustering for Large Applications) which extends k-medoids to + be more scalable using a sampling approach. + [`#83 `_]. - Fix `_estimator_type` for :class:`~sklearn_extra.robust` estimators. Fix misbehavior of scikit-learn's :class:`~sklearn.model_selection.cross_val_score` and :class:`~sklearn.grid_search.GridSearchCV` for :class:`~sklearn_extra.robust.RobustWeightedClassifier` diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 128b52bf..cccd575c 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -24,6 +24,27 @@ from ._k_medoids_helper import _compute_optimal_swap, _build +def _compute_inertia(distances): + """Compute inertia of new samples. Inertia is defined as the sum of the + sample distances to closest cluster centers. + + Parameters + ---------- + distances : {array-like, sparse matrix}, shape=(n_samples, n_clusters) + Distances to cluster centers. + + Returns + ------- + Sum of sample distances to closest cluster centers. + """ + + # Define inertia as the sum of the sample-distances + # to closest cluster centers + inertia = np.sum(np.min(distances, axis=1)) + + return inertia + + class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin): """k-medoids clustering. @@ -270,7 +291,7 @@ def fit(self, X, y=None): # the training data to clusters self.labels_ = np.argmin(D[medoid_idxs, :], axis=0) self.medoid_indices_ = medoid_idxs - self.inertia_ = self._compute_inertia(self.transform(X)) + self.inertia_ = _compute_inertia(self.transform(X)) # Return self to enable method chaining return self @@ -312,7 +333,7 @@ def _update_medoid_idxs_in_place(self, D, labels, medoid_idxs): def _compute_cost(self, D, medoid_idxs): """Compute the cose for a given configuration of the medoids""" - return self._compute_inertia(D[:, medoid_idxs]) + return _compute_inertia(D[:, medoid_idxs]) def transform(self, X): """Transforms X to cluster-distance space. @@ -386,26 +407,6 @@ def predict(self, X): return pd_argmin - def _compute_inertia(self, distances): - """Compute inertia of new samples. Inertia is defined as the sum of the - sample distances to closest cluster centers. 
+ + Parameters + ---------- + distances : {array-like, sparse matrix}, shape=(n_samples, n_clusters) + Distances to cluster centers. + + Returns + ------- + Sum of sample distances to closest cluster centers. + """ + + # Define inertia as the sum of the sample-distances + # to closest cluster centers + inertia = np.sum(np.min(distances, axis=1)) + + return inertia + + class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin): """k-medoids clustering. @@ -270,7 +291,7 @@ def fit(self, X, y=None): # the training data to clusters self.labels_ = np.argmin(D[medoid_idxs, :], axis=0) self.medoid_indices_ = medoid_idxs - self.inertia_ = self._compute_inertia(self.transform(X)) + self.inertia_ = _compute_inertia(self.transform(X)) # Return self to enable method chaining return self @@ -312,7 +333,7 @@ def _update_medoid_idxs_in_place(self, D, labels, medoid_idxs): medoid_idxs[k] = cluster_k_idxs[min_cost_idx] def _compute_cost(self, D, medoid_idxs): """Compute the cost for a given configuration of the medoids""" - return self._compute_inertia(D[:, medoid_idxs]) + return _compute_inertia(D[:, medoid_idxs]) def transform(self, X): @@ -386,26 +407,6 @@ def predict(self, X): return pd_argmin - def _compute_inertia(self, distances): - """Compute inertia of new samples. Inertia is defined as the sum of the - sample distances to closest cluster centers. - - Parameters - ---------- - distances : {array-like, sparse matrix}, shape=(n_samples, n_clusters) - Distances to cluster centers. - - Returns - ------- - Sum of sample distances to closest cluster centers. - """ - - # Define inertia as the sum of the sample-distances - # to closest cluster centers - inertia = np.sum(np.min(distances, axis=1)) - - return inertia - def _initialize_medoids(self, D, n_clusters, random_state_): """Select initial medoids when beginning clustering.""" @@ -677,7 +678,7 @@ def fit(self, X, y=None): ) pam.fit(X[sample_idxs]) self.cluster_centers_ = pam.cluster_centers_ - self.inertia_ = self._compute_inertia(self.transform(X)) + self.inertia_ = _compute_inertia(self.transform(X)) if self.inertia_ < best_score: best_score = self.inertia_ @@ -690,26 +691,6 @@ def fit(self, X, y=None): return self - def _compute_inertia(self, distances): - """Compute inertia of new samples. Inertia is defined as the sum of the - sample distances to closest cluster centers. - - Parameters - ---------- - distances : {array-like, sparse matrix}, shape=(n_samples, n_clusters) - Distances to cluster centers. - - Returns - ------- - Sum of sample distances to closest cluster centers. - """ - - # Define inertia as the sum of the sample-distances - # to closest cluster centers - inertia = np.sum(np.min(distances, axis=1)) - - return inertia - def transform(self, X): """Transforms X to cluster-distance space.
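
Usage sketch (assumptions: the post-PATCH-15 parameter names n_sampling and n_sampling_iter, and illustrative data and parameter values):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn_extra.cluster import CLARA, KMedoids

# Two well-separated blobs; 2000 points keeps full PAM tractable for comparison.
X, _ = make_blobs(n_samples=2000, centers=[[0, 0], [4, 4]], random_state=0)

# CLARA fits PAM on n_sampling_iter sub-samples of size n_sampling and keeps
# the medoids with the best inertia over the whole dataset.
clara = CLARA(n_clusters=2, n_sampling=44, n_sampling_iter=5, random_state=0)
clara.fit(X)

# Reference solution: PAM on the full dataset (quadratic in n_samples).
kmedoids = KMedoids(n_clusters=2, method="pam", random_state=0).fit(X)

# CLARA's inertia is typically close to that of the full PAM solution,
# while only ever running PAM on 44-point sub-samples.
print(clara.inertia_, kmedoids.inertia_)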