From 94e14c4cd5abce50959174b794b30bd7d4488cae Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Sun, 29 Nov 2020 17:44:55 +0100 Subject: [PATCH 01/16] add CLARA --- sklearn_extra/cluster/__init__.py | 4 +- sklearn_extra/cluster/_k_medoids.py | 220 +++++++++++++++++- sklearn_extra/cluster/tests/test_k_medoids.py | 13 +- 3 files changed, 232 insertions(+), 5 deletions(-) diff --git a/sklearn_extra/cluster/__init__.py b/sklearn_extra/cluster/__init__.py index 0d4cf43c..426f8b99 100644 --- a/sklearn_extra/cluster/__init__.py +++ b/sklearn_extra/cluster/__init__.py @@ -1,4 +1,4 @@ -from ._k_medoids import KMedoids +from ._k_medoids import KMedoids, CLARA from ._commonnn import commonnn, CommonNNClustering -__all__ = ["KMedoids", "CommonNNClustering", "commonnn"] +__all__ = ["KMedoids", "CLARA", "CommonNNClustering", "commonnn"] diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 18fb987d..ea5adc18 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -39,9 +39,9 @@ class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin): What distance metric to use. See :func:metrics.pairwise_distances method : {'alternate', 'pam'}, default: 'alternate' - Which algorithm to use. + Which algorithm to use. 'pam' can be more accurate but is slower. - init : {'random', 'heuristic', 'k-medoids++', 'build'}, optional, default: 'build' + init : {'random', 'heuristic', 'k-medoids++', 'build'}, optional, default: 'heuristic' Specify medoid initialization method. 'random' selects n_clusters elements from the dataset. 'heuristic' picks the n_clusters points with the smallest sum distance to every other point. 'k-medoids++' @@ -229,6 +229,7 @@ def fit(self, X, y=None): # update Djs and Ejs with new medoids Djs, Ejs = np.sort(D[medoid_idxs], axis=0)[[0, 1]] + else: raise ValueError( f"method={self.method} is not supported. Supported methods " @@ -476,3 +477,218 @@ def _kpp_init(self, D, n_clusters, random_state_, n_local_trials=None): closest_dist_sq = best_dist_sq return centers + + +class CLARA(BaseEstimator, ClusterMixin, TransformerMixin): + """CLARA clustering. + + Read more in the :ref:`User Guide `. + CLARA (Clustering for Large Applications) extends the k-medoids approach to a + large number of objects. This algorithm uses a sampling approach. + + Parameters + ---------- + n_clusters : int, optional, default: 8 + The number of clusters to form as well as the number of medoids to + generate. + + metric : string, or callable, optional, default: 'euclidean' + What distance metric to use. See :func:metrics.pairwise_distances + + max_iter : int, optional, default : 300 + Specify the maximum number of iterations when fitting PAM. It can be zero in + which case only the initialization is computed. + + random_state : int, RandomState instance or None, optional + Specify random state for the random number generator. Used to + initialise medoids when init='random'. + + Attributes + ---------- + cluster_centers_ : array, shape = (n_clusters, n_features) + or None if metric == 'precomputed' + Cluster centers, i.e. medoids (elements from the original dataset) + + medoid_indices_ : array, shape = (n_clusters,) + The indices of the medoid rows in X + + labels_ : array, shape = (n_samples,) + Labels of each point + + inertia_ : float + Sum of distances of samples to their closest cluster center. + + Examples + -------- + >>> from sklearn_extra.cluster import KMedoids + >>> import numpy as np + + >>> X = np.asarray([[1, 2], [1, 4], [1, 0], + ...
[4, 2], [4, 4], [4, 0]]) + >>> clara = CLARA(n_clusters=2, random_state=0).fit(X) + >>> clara.labels_ + array([0, 0, 0, 1, 1, 1]) + >>> clara.predict([[0,0], [4,4]]) + array([0, 1]) + >>> clara.cluster_centers_ + array([[1, 2], + [4, 2]]) + >>> clara.inertia_ + 8.0 + + + References + ---------- + Kaufman, L. and Rousseeuw, P.J. (2008). Clustering Large Applications (Program CLARA). + In Finding Groups in Data (eds L. Kaufman and P.J. Rousseeuw). + doi:10.1002/9780470316801.ch3 + + See also + -------- + + KMedoids + CLARA is a variant of KMedoids that uses a sub-sampling scheme; as such, if the + dataset is sufficiently small, KMedoids is preferable. + + Notes + ----- + Contrary to KMedoids, CLARA is linear in N, the sample size, in both space + and time complexity. On the other hand, it scales quadratically with sampling_size. + + """ + + def __init__( + self, + n_clusters=8, + metric="euclidean", + init="build", + max_iter=300, + sampling_size=None, + samples=5, + random_state=None, + ): + self.n_clusters = n_clusters + self.metric = metric + self.init = init + self.max_iter = max_iter + self.sampling_size = sampling_size + self.samples = samples + self.random_state = random_state + + def fit(self, X, y=None): + n = len(X) + + if self.sampling_size is None: + sampling_size = min(n, 40 + 2 * self.n_clusters) + else: + sampling_size = self.sampling_size + rng = np.random.RandomState(self.random_state) + medoids_idxs = rng.choice( + np.arange(n), size=self.n_clusters, replace=False + ) + best_score = np.inf + for _ in range(self.samples): + sample_idxs = np.hstack( + [ + medoids_idxs, + rng.choice( + np.delete(np.arange(n), medoids_idxs), + size=sampling_size - self.n_clusters, + replace=False, + ), + ] + ) + pam = KMedoids( + n_clusters=self.n_clusters, + metric=self.metric, + method="pam", + init=self.init, + max_iter=self.max_iter, + random_state=rng, + ) + pam.fit(X[sample_idxs]) + self.cluster_centers_ = pam.cluster_centers_ + self.inertia_ = self._compute_inertia(self.transform(X)) + + # Keep the sub-sample whose medoids give the best inertia on the whole dataset. + if self.inertia_ < best_score: + best_score = self.inertia_ + # Map medoid positions in the sub-sample back to indices in the full X. + medoids_idxs = sample_idxs[pam.medoid_indices_] + best_sample_idxs = sample_idxs + + self.medoid_indices_ = medoids_idxs + self.labels_ = np.argmin(self.transform(X), axis=0) + + return self + + def _compute_inertia(self, distances): + """Compute inertia of new samples. Inertia is defined as the sum of the + sample distances to closest cluster centers. + + Parameters + ---------- + distances : {array-like, sparse matrix}, shape=(n_samples, n_clusters) + Distances to cluster centers. + + Returns + ------- + Sum of sample distances to closest cluster centers. + """ + + # Define inertia as the sum of the sample-distances + # to closest cluster centers + inertia = np.sum(np.min(distances, axis=1)) + + return inertia + + def transform(self, X): + """Transforms X to cluster-distance space. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_query, n_features), \ + or (n_query, n_indexed) if metric == 'precomputed' + Data to transform. + + Returns + ------- + X_new : {array-like, sparse matrix}, shape=(n_query, n_clusters) + X transformed in the new space of distances to cluster centers.
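+            Only the distances from X to the n_clusters medoids are computed, so a call scales linearly in n_query.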
+ """ + X = check_array(X, accept_sparse=["csr", "csc"]) + + if self.metric == "precomputed": + check_is_fitted(self, "medoid_indices_") + return X[:, self.medoid_indices_] + else: + check_is_fitted(self, "cluster_centers_") + + Y = self.cluster_centers_ + return pairwise_distances(X, Y=Y, metric=self.metric) + + def predict(self, X): + """Predict the closest cluster for each sample in X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_query, n_features), \ + or (n_query, n_indexed) if metric == 'precomputed' + New data to predict. + + Returns + ------- + labels : array, shape = (n_query,) + Index of the cluster each sample belongs to. + """ + X = check_array(X, accept_sparse=["csr", "csc"]) + + if self.metric == "precomputed": + check_is_fitted(self, "medoid_indices_") + return np.argmin(X[:, self.medoid_indices_], axis=1) + else: + check_is_fitted(self, "cluster_centers_") + + # Return data points to clusters based on which cluster assignment + # yields the smallest distance + return pairwise_distances_argmin( + X, Y=self.cluster_centers_, metric=self.metric + ) diff --git a/sklearn_extra/cluster/tests/test_k_medoids.py b/sklearn_extra/cluster/tests/test_k_medoids.py index f6854ee5..eaaf0106 100644 --- a/sklearn_extra/cluster/tests/test_k_medoids.py +++ b/sklearn_extra/cluster/tests/test_k_medoids.py @@ -10,7 +10,7 @@ from sklearn.metrics.pairwise import euclidean_distances from numpy.testing import assert_allclose, assert_array_equal -from sklearn_extra.cluster import KMedoids +from sklearn_extra.cluster import KMedoids, CLARA from sklearn.cluster import KMeans from sklearn.datasets import make_blobs @@ -43,6 +43,17 @@ def test_kmedoid_results(method, init): ) +def test_clara_results(): + expected = np.hstack([np.zeros(50), np.ones(50)]) + km = CLARA(n_clusters=2) + km.fit(X_cc) + # This test use data that are not perfectly separable so the + # accuracy is not 1. Accuracy around 0.85 + assert (np.mean(km.labels_ == expected) > 0.8) or ( + 1 - np.mean(km.labels_ == expected) > 0.8 + ) + + def test_medoids_invalid_method(): with pytest.raises(ValueError, match="invalid is not supported"): KMedoids(n_clusters=1, method="invalid").fit([[0, 1], [1, 1]]) From 51d4034e8945adc10d014c66df4dff673bebe9ea Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Mon, 30 Nov 2020 11:28:33 +0100 Subject: [PATCH 02/16] add example --- examples/plot_clara_digits.py | 121 ++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 examples/plot_clara_digits.py diff --git a/examples/plot_clara_digits.py b/examples/plot_clara_digits.py new file mode 100644 index 00000000..9eb63275 --- /dev/null +++ b/examples/plot_clara_digits.py @@ -0,0 +1,121 @@ +""" +====================================================================== +A demo of K-Medoids vs CLARA clustering on the handwritten digits data +====================================================================== +In this example we compare different computation time of K-Medoids and CLARA on +the handwritten digits data. 
+""" +import numpy as np +import matplotlib.pyplot as plt +import time + +from sklearn_extra.cluster import KMedoids, CLARA +from sklearn.datasets import load_digits +from sklearn.decomposition import PCA +from sklearn.preprocessing import scale + +print(__doc__) + +# Authors: Timo Erkkilä +# Antti Lehmussola +# Kornel Kiełczewski +# License: BSD 3 clause + +np.random.seed(42) + +digits = load_digits() +data = scale(digits.data) +n_digits = len(np.unique(digits.target)) + +reduced_data = PCA(n_components=2).fit_transform(data) + +# Step size of the mesh. Decrease to increase the quality of the VQ. +h = 0.02 # point in the mesh [x_min, m_max]x[y_min, y_max]. + +# Plot the decision boundary. For that, we will assign a color to each +x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1 +y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1 +xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) + +plt.figure() +plt.clf() + +plt.suptitle( + "Comparing multiple K-Medoids metrics to K-Means and each other", + fontsize=14, +) + + +selected_models = [ + ( + KMedoids(metric="cosine", n_clusters=n_digits), + "KMedoids (cosine)", + ), + ( + KMedoids(metric="manhattan", n_clusters=n_digits), + "KMedoids (manhattan)", + ), + ( + CLARA( + metric="cosine", + n_clusters=n_digits, + init="heuristic", + sampling_size=50, + ), + "CLARA (cosine)", + ), + ( + CLARA( + metric="manhattan", + n_clusters=n_digits, + init="heuristic", + sampling_size=50, + ), + "CLARA (manhattan)", + ), +] + +plot_rows = int(np.ceil(len(selected_models) / 2.0)) +plot_cols = 2 + +for i, (model, description) in enumerate(selected_models): + + # Obtain labels for each point in mesh. Use last trained model. + init_time = time.time() + model.fit(reduced_data) + Z = model.predict(np.c_[xx.ravel(), yy.ravel()]) + computation_time = time.time() - init_time + + # Put the result into a color plot + Z = Z.reshape(xx.shape) + plt.subplot(plot_cols, plot_rows, i + 1) + plt.imshow( + Z, + interpolation="nearest", + extent=(xx.min(), xx.max(), yy.min(), yy.max()), + cmap=plt.cm.Paired, + aspect="auto", + origin="lower", + ) + + plt.plot( + reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2, alpha=0.3 + ) + # Plot the centroids as a white X + centroids = model.cluster_centers_ + plt.scatter( + centroids[:, 0], + centroids[:, 1], + marker="x", + s=169, + linewidths=3, + color="w", + zorder=10, + ) + plt.title(description + ": %.2Fs" % (computation_time)) + plt.xlim(x_min, x_max) + plt.ylim(y_min, y_max) + plt.xticks(()) + plt.yticks(()) + +plt.show() From 6204cba82fafef12b0cd2893b7c87a19b78ee44f Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Mon, 30 Nov 2020 11:39:17 +0100 Subject: [PATCH 03/16] fix typo --- examples/plot_clara_digits.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/plot_clara_digits.py b/examples/plot_clara_digits.py index 9eb63275..9afcb7c8 100644 --- a/examples/plot_clara_digits.py +++ b/examples/plot_clara_digits.py @@ -41,7 +41,7 @@ plt.clf() plt.suptitle( - "Comparing multiple K-Medoids metrics to K-Means and each other", + "Comparing KMedoids and CLARA", fontsize=14, ) From 61a580c5b028304b80f2f55b07548c71fbaf2d35 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Fri, 18 Dec 2020 19:06:37 +0100 Subject: [PATCH 04/16] add doc --- doc/api.rst | 1 + doc/modules/cluster.rst | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/doc/api.rst b/doc/api.rst index 57b36246..25fc8ed8 100644 --- 
a/doc/api.rst +++ b/doc/api.rst @@ -32,6 +32,7 @@ Clustering cluster.KMedoids cluster.CommonNNClustering + cluster.CLARA Robust ==================== diff --git a/doc/modules/cluster.rst b/doc/modules/cluster.rst index bb351308..d987fb3e 100644 --- a/doc/modules/cluster.rst +++ b/doc/modules/cluster.rst @@ -80,6 +80,38 @@ when speed is an issue. for performing face recognition. International Journal of Soft Computing, Mathematics and Control, 3(3), pp 1-12. + + +CLARA +===== + + :class:`CLARA` is related to the :class:`KMedoids` algorithm. CLARA + (Clustering for Large Applications) extends k-medoids approach for a + large number of objects. This algorithm use a sampling approach. + + .. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_plot_clara_digits.py`: Comparing K-Medoids and CLARA on digits + with various distance metrics. + + + **Algorithm description:** + CLARA use `sample` random samples of the dataset, each of size `sampling_size` + The algorithm is iterative: first we select one sub-sample, then CLARA applies + KMedoids on this sub-sample to obtain `n_clusters` medoids. At the next step, + CLARA samples `sampling_size` - `n_clusters` new points from the dataset and the next sub-sample + is composed of the best medoids found until now (with respect to inertia in the + whole dataset, not the inertia only on the sub-sample) to which we add the new + samples just drawn. Then, K-Medoids is applied to this new sub-sample, and we loop + back until `sample` sub-samples have been used. + + + .. topic:: References: + + * Kaufman, L. and Rousseeuw, P.J. (2008). Clustering Large Applications (Program CLARA). + In Finding Groups in Data (eds L. Kaufman and P.J. Rousseeuw). + doi:10.1002/9780470316801.ch3 + .. _commonnn: Common-nearest-neighbors clustering From ebabc8a39a0811b434016a2aad9695fce2823b9b Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Fri, 18 Dec 2020 19:06:48 +0100 Subject: [PATCH 05/16] fix docstring --- sklearn_extra/cluster/_k_medoids.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 8abaa035..5c27bc69 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -520,22 +520,16 @@ class CLARA(BaseEstimator, ClusterMixin, TransformerMixin): Examples -------- - >>> from sklearn_extra.cluster import KMedoids + >>> from sklearn_extra.cluster import CLARA >>> import numpy as np - - >>> X = np.asarray([[1, 2], [1, 4], [1, 0], - ...
[4, 2], [4, 4], [4, 0]]) + >>> from sklearn.datasets import make_blobs + >>> X, _ = make_blobs(centers=[[0,0],[1,1]],n_samples=100, n_features=2, + random_state=0) >>> clara = CLARA(n_clusters=2, random_state=0).fit(X) - >>> clara.labels_ - array([0, 0, 0, 1, 1, 1]) >>> clara.predict([[0,0], [4,4]]) array([0, 1]) - >>> clara.cluster_centers_ - array([[1, 2], - [4, 2]]) >>> clara.inertia_ - 8.0 - + 122.44919397611667 References ---------- Kaufman, L. and Rousseeuw, P.J. (2008). Clustering Large Applications (Program CLARA). In Finding Groups in Data (eds L. Kaufman and P.J. Rousseeuw). doi:10.1002/9780470316801.ch3 @@ -616,7 +610,7 @@ def fit(self, X, y=None): best_sample_idxs = sample_idxs self.medoid_indices_ = medoids_idxs - self.labels_ = np.argmin(self.transform(X), axis=0) + self.labels_ = np.argmin(self.transform(X), axis=1) return self From 007d0e8e360d1fd565d20c189ccbc4aa41fbf1d5 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Mon, 21 Dec 2020 10:36:41 +0100 Subject: [PATCH 06/16] add CLARA to test_common --- sklearn_extra/cluster/_k_medoids.py | 60 +++++++++++++++++++++++++---- sklearn_extra/tests/test_common.py | 3 +- 2 files changed, 55 insertions(+), 8 deletions(-) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 5c27bc69..cf4b8854 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -200,7 +200,15 @@ def fit(self, X, y=None): if self.method == "pam": # Compute the distance to the first and second closest points # among medoids. - Djs, Ejs = np.sort(D[medoid_idxs], axis=0)[[0, 1]] + if self.n_clusters == 1 and self.max_iter > 0: + # PAM SWAP step can only be used for n_clusters > 1 + warnings.warn( + "n_clusters should be larger than 1 if max_iter != 0; " + "setting max_iter to 0." + ) + self.max_iter = 0 + elif self.max_iter > 0: + Djs, Ejs = np.sort(D[medoid_idxs], axis=0)[[0, 1]] # Continue the algorithm as long as # the medoids keep changing and the maximum number @@ -496,8 +504,17 @@ class CLARA(BaseEstimator, ClusterMixin, TransformerMixin): What distance metric to use. See :func:metrics.pairwise_distances max_iter : int, optional, default : 300 - Specify the maximum number of iterations when fitting PAM. It can be zero in - which case only the initialization is computed. + Specify the maximum number of iterations when fitting PAM. It can be zero + in which case only the initialization is computed. + + sampling_size : int or None, optional, default : None + Size of the sampled dataset at each iteration. This is a trade-off + between complexity and efficiency. If None, it is set + to min(n_samples, 40 + 2 * self.n_clusters), as suggested by the authors of the + algorithm. It must be smaller than n_samples. + + samples : int, optional, default : 5 + Number of sub-samples to draw, i.e. the number of sampling iterations. random_state : int, RandomState instance or None, optional Specify random state for the random number generator. Used to @@ -570,14 +587,42 @@ def __init__( self.random_state = random_state def fit(self, X, y=None): + """Fit CLARA to the provided data. + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features), \ + or (n_samples, n_samples) if metric == 'precomputed' + Dataset to cluster. + + y : Ignored + + Returns + ------- + self + """ + X = check_array(X) n = len(X) + random_state_ = check_random_state(self.random_state) + if self.sampling_size is None: sampling_size = min(n, 40 + 2 * self.n_clusters) else: sampling_size = self.sampling_size - rng = np.random.RandomState(self.random_state) - medoids_idxs = rng.choice( + + # Check sampling_size.
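+        # The checks below require the dataset to contain at least sampling_size points (so that a full sub-sample can be drawn) and at least n_clusters points (so that n_clusters distinct medoids exist).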
+ if n < sampling_size: + raise ValueError( + "sample_size should be greater than self.sampling_size" + ) + + if n < self.n_clusters: + raise ValueError( + "sample_size should be greater than self.n_clusters" + ) + + medoids_idxs = random_state_.choice( np.arange(n), size=self.n_clusters, replace=False ) best_score = np.inf @@ -585,7 +630,7 @@ def fit(self, X, y=None): sample_idxs = np.hstack( [ medoids_idxs, - rng.choice( + random_state_.choice( np.delete(np.arange(n), medoids_idxs), size=sampling_size - self.n_clusters, replace=False, @@ -598,7 +643,7 @@ def fit(self, X, y=None): method="pam", init=self.init, max_iter=self.max_iter, - random_state=rng, + random_state=random_state_, ) pam.fit(X[sample_idxs]) self.cluster_centers_ = pam.cluster_centers_ @@ -611,6 +656,7 @@ def fit(self, X, y=None): self.medoid_indices_ = medoids_idxs self.labels_ = np.argmin(self.transform(X), axis=1) + self.n_iter_ = self.samples return self diff --git a/sklearn_extra/tests/test_common.py b/sklearn_extra/tests/test_common.py index 2da6cf22..3a72dc32 100644 --- a/sklearn_extra/tests/test_common.py +++ b/sklearn_extra/tests/test_common.py @@ -3,7 +3,7 @@ from sklearn_extra.kernel_approximation import Fastfood from sklearn_extra.kernel_methods import EigenProClassifier, EigenProRegressor -from sklearn_extra.cluster import KMedoids, CommonNNClustering +from sklearn_extra.cluster import KMedoids, CommonNNClustering, CLARA from sklearn_extra.robust import ( RobustWeightedClassifier, RobustWeightedRegressor, @@ -14,6 +14,7 @@ ALL_ESTIMATORS = [ Fastfood, KMedoids, + CLARA, EigenProClassifier, EigenProRegressor, CommonNNClustering, From 6a461d62b28842f613d3ec46dabc36b4278aade7 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Mon, 21 Dec 2020 11:03:07 +0100 Subject: [PATCH 07/16] add size check to pass tests --- sklearn_extra/cluster/_k_medoids.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index cf4b8854..eb24c87f 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -540,8 +540,7 @@ class CLARA(BaseEstimator, ClusterMixin, TransformerMixin): >>> from sklearn_extra.cluster import CLARA >>> import numpy as np >>> from sklearn.datasets import make_blobs - >>> X, _ = make_blobs(centers=[[0,0],[1,1]],n_samples=100, n_features=2, - random_state=0) + >>> X, _ = make_blobs(centers=[[0,0],[1,1]], n_features=2,random_state=0) >>> clara = CLARA(n_clusters=2, random_state=0).fit(X) >>> clara.predict([[0,0], [4,4]]) array([0, 1]) @@ -622,21 +621,17 @@ def fit(self, X, y=None): "sample_size should be greater than self.n_clusters" ) + if self.n_clusters <= sampling_size: + raise ValueError( + "sampling size must be strictly greater than self.n_clustes" + ) + medoids_idxs = random_state_.choice( np.arange(n), size=self.n_clusters, replace=False ) best_score = np.inf for _ in range(self.samples): - sample_idxs = np.hstack( - [ - medoids_idxs, - random_state_.choice( - np.delete(np.arange(n), medoids_idxs), - size=sampling_size - self.n_clusters, - replace=False, - ), - ] - ) + pam = KMedoids( n_clusters=self.n_clusters, metric=self.metric, From 6e1cdedefb9ed50ff1e3c880192ed247bf9dc91d Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Mon, 21 Dec 2020 11:38:39 +0100 Subject: [PATCH 08/16] fix tests --- sklearn_extra/cluster/_k_medoids.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/sklearn_extra/cluster/_k_medoids.py 
b/sklearn_extra/cluster/_k_medoids.py index eb24c87f..cd1779e4 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -606,24 +606,22 @@ def fit(self, X, y=None): random_state_ = check_random_state(self.random_state) if self.sampling_size is None: - sampling_size = min(n, 40 + 2 * self.n_clusters) + sampling_size = max( + min(n, 40 + 2 * self.n_clusters), self.n_clusters + 1 + ) else: sampling_size = self.sampling_size # Check sampling_size. - if n < sampling_size: - raise ValueError( - "sample_size should be greater than self.sampling_size" - ) if n < self.n_clusters: raise ValueError( "sample_size should be greater than self.n_clusters" ) - if self.n_clusters <= sampling_size: + if self.n_clusters >= sampling_size: raise ValueError( - "sampling size must be strictly greater than self.n_clustes" + "sampling size must be strictly greater than self.n_clusters" ) medoids_idxs = random_state_.choice( @@ -631,7 +629,19 @@ def fit(self, X, y=None): ) best_score = np.inf for _ in range(self.samples): - + if sampling_size >= n: + sample_idxs = np.arange(n) + else: + sample_idxs = np.hstack( + [ + medoids_idxs, + random_state_.choice( + np.delete(np.arange(n), medoids_idxs), + size=sampling_size - self.n_clusters, + replace=False, + ), + ] + ) pam = KMedoids( n_clusters=self.n_clusters, metric=self.metric, From a95c1c383fa782f32edf412ae23651655c0f1026 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Sun, 18 Apr 2021 14:21:32 +0200 Subject: [PATCH 09/16] update doc --- doc/modules/cluster.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/cluster.rst b/doc/modules/cluster.rst index 722f99e3..21d5329d 100644 --- a/doc/modules/cluster.rst +++ b/doc/modules/cluster.rst @@ -1,8 +1,8 @@ .. _cluster: -===================================================== -Clustering with KMedoids and Common-nearest-neighbors -===================================================== +============================================================ +Clustering with KMedoids, CLARA and Common-nearest-neighbors +============================================================ .. _k_medoids: K-Medoids @@ -89,7 +89,7 @@ CLARA :class:`CLARA` is related to the :class:`KMedoids` algorithm. CLARA (Clustering for Large Applications) extends k-medoids approach for a - large number of objects. This algorithm use a sampling approach. + large number of objects. This algorithm uses a sampling approach. .. topic:: Examples: From 107db44e598d68f8a03c7d94cc7219d2613ebb82 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Sat, 29 May 2021 11:05:16 +0200 Subject: [PATCH 10/16] add test consistency clara kmedoids --- sklearn_extra/cluster/tests/test_k_medoids.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/sklearn_extra/cluster/tests/test_k_medoids.py b/sklearn_extra/cluster/tests/test_k_medoids.py index 857c7258..d35ad568 100644 --- a/sklearn_extra/cluster/tests/test_k_medoids.py +++ b/sklearn_extra/cluster/tests/test_k_medoids.py @@ -348,7 +348,7 @@ def test_kmedoids_on_sparse_input(): # Test the build initialization. 
def test_build(): X, y = fetch_20newsgroups_vectorized(return_X_y=True) - # Select only the first 1000 samples + # Select only the first 500 samples X = X[:500] y = y[:500] # Precompute cosine distance matrix @@ -358,3 +358,20 @@ def test_build(): ske.fit(diss) assert ske.inertia_ <= 230 assert len(np.unique(ske.labels_)) == 20 + + +def test_clara_consistency_iris(): + # Test that CLARA matches PAM when the full sample is used. + + rng = np.random.RandomState(seed) + X_iris = load_iris()["data"] + + clara = CLARA( + n_clusters=3, samples=1, sampling_size=len(X_iris), random_state=rng + ) + + model = KMedoids(n_clusters=3, init="build", random_state=rng) + + model.fit(X_iris) + clara.fit(X_iris) + assert np.sum(model.labels_ == clara.labels_) == len(X_iris) From e7b88cd15955714a53eb2a3c28efb809b05176d9 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Sat, 29 May 2021 11:12:00 +0200 Subject: [PATCH 11/16] black --- sklearn_extra/cluster/_k_medoids.py | 4 ++-- sklearn_extra/kernel_approximation/_fastfood.py | 4 ++-- sklearn_extra/robust/robust_weighted_estimator.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 3f0484a5..9a2baac6 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -152,7 +152,7 @@ def _check_nonnegative_int(self, value, desc, strict=True): ) def _check_init_args(self): - """Validates the input arguments. """ + """Validates the input arguments.""" # Check n_clusters and max_iter self._check_nonnegative_int(self.n_clusters, "n_clusters") @@ -307,7 +307,7 @@ def _update_medoid_idxs_in_place(self, D, labels, medoid_idxs): medoid_idxs[k] = cluster_k_idxs[min_cost_idx] def _compute_cost(self, D, medoid_idxs): - """ Compute the cost for a given configuration of the medoids""" + """Compute the cost for a given configuration of the medoids""" return self._compute_inertia(D[:, medoid_idxs]) def transform(self, X): diff --git a/sklearn_extra/kernel_approximation/_fastfood.py b/sklearn_extra/kernel_approximation/_fastfood.py index e30d042f..715dfc4a 100644 --- a/sklearn_extra/kernel_approximation/_fastfood.py +++ b/sklearn_extra/kernel_approximation/_fastfood.py @@ -116,7 +116,7 @@ def _uniform_vector(self, rng): return None def _apply_approximate_gaussian_matrix(self, B, G, P, X): - """ Create mapping of all x_i by applying B, G and P step-wise """ + """Create mapping of all x_i by applying B, G and P step-wise""" num_examples = X.shape[0] result = np.multiply(B, X.reshape((1, num_examples, 1, self._d))) @@ -134,7 +134,7 @@ def _apply_approximate_gaussian_matrix(self, B, G, P, X): return result def _scale_transformed_data(self, S, VX): - """ Scale mapped data VX to match kernel(e.g. RBF-Kernel) """ + """Scale mapped data VX to match kernel(e.g. RBF-Kernel)""" VX = VX.reshape(-1, self._times_to_stack_v * self._d) return ( diff --git a/sklearn_extra/robust/robust_weighted_estimator.py b/sklearn_extra/robust/robust_weighted_estimator.py index 2e947550..1e74f963 100644 --- a/sklearn_extra/robust/robust_weighted_estimator.py +++ b/sklearn_extra/robust/robust_weighted_estimator.py @@ -418,7 +418,7 @@ def fit(self, X, y=None): return self def _get_loss_function(self, loss): - """Get concrete ''LossFunction'' object for str ''loss''.
""" + """Get concrete ''LossFunction'' object for str ''loss''.""" if type(loss) == str: eff_loss = LOSS_FUNCTIONS.get(loss) if eff_loss is None: From fd3fa7242d79dc84038920fa66aacc70208138d8 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Fri, 18 Jun 2021 10:49:57 +0200 Subject: [PATCH 12/16] handle types KMedoids --- sklearn_extra/cluster/_k_medoids.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 9a2baac6..528104d6 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -194,6 +194,7 @@ def fit(self, X, y=None): ) D = pairwise_distances(X, metric=self.metric) + medoid_idxs = self._initialize_medoids( D, self.n_clusters, random_state_ ) @@ -202,6 +203,11 @@ def fit(self, X, y=None): if self.method == "pam": # Compute the distance to the first and second closest points # among medoids. + if (X.dtype is np.dtype(np.float32)) or ( + X.dtype is np.dtype(np.float16) + ): + D = D.astype(np.float32) + if self.n_clusters == 1 and self.max_iter > 0: # PAM SWAP step can only be used for n_clusters > 1 warnings.warn( @@ -211,6 +217,8 @@ def fit(self, X, y=None): self.max_iter = 0 elif self.max_iter > 0: Djs, Ejs = np.sort(D[medoid_idxs], axis=0)[[0, 1]] + elif self.init != "build": + D = D.astype(X.dtype) # Continue the algorithm as long as # the medoids keep changing and the maximum number From 06d7650e48ff59560da2783e9858856f8fb615c3 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu <30346931+TimotheeMathieu@users.noreply.github.com> Date: Thu, 24 Jun 2021 19:11:55 +0200 Subject: [PATCH 13/16] Apply suggestions from code review Co-authored-by: Roman Yurchak --- doc/modules/cluster.rst | 6 +++--- sklearn_extra/cluster/_k_medoids.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/modules/cluster.rst b/doc/modules/cluster.rst index 21d5329d..5bf9e259 100644 --- a/doc/modules/cluster.rst +++ b/doc/modules/cluster.rst @@ -88,8 +88,8 @@ CLARA ===== :class:`CLARA` is related to the :class:`KMedoids` algorithm. CLARA - (Clustering for Large Applications) extends k-medoids approach for a - large number of objects. This algorithm uses a sampling approach. + (Clustering for Large Applications) extends k-medoids to be more scalable, + uses a sampling approach. .. topic:: Examples: @@ -98,7 +98,7 @@ CLARA **Algorithm description:** - CLARA use `sample` random samples of the dataset, each of size `sampling_size` + CLARA uses random samples of the dataset, each of size `sampling_size` The algorith is iterative, first we select one sub-sample, then CLARA applies KMedoids on this sub-sample to obtain `n_clusters` medoids. At the next step, CLARA sample `sampling_size`-`n_clusters` from the dataset and the next sub-sample diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 528104d6..9f761ac6 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -724,7 +724,7 @@ def transform(self, X): X_new : {array-like, sparse matrix}, shape=(n_query, n_clusters) X transformed in the new space of distances to cluster centers. """ - X = check_array(X, accept_sparse=["csr", "csc"]) + X = check_array(X, accept_sparse=["csr", "csc"], dtypes=[np.float64, np.float32]) if self.metric == "precomputed": check_is_fitted(self, "medoid_indices_") @@ -749,7 +749,7 @@ def predict(self, X): labels : array, shape = (n_query,) Index of the cluster each sample belongs to. 
""" - X = check_array(X, accept_sparse=["csr", "csc"]) + X = check_array(X, accept_sparse=["csr", "csc"], dtypes=[np.float64, np.float32]) if self.metric == "precomputed": check_is_fitted(self, "medoid_indices_") From ed91aebe2a147cffa6fb8f08ad3088cd3be4c706 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Thu, 24 Jun 2021 20:19:06 +0200 Subject: [PATCH 14/16] correct 32 bit --- sklearn_extra/cluster/_k_medoids.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 9f761ac6..528104d6 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -724,7 +724,7 @@ def transform(self, X): X_new : {array-like, sparse matrix}, shape=(n_query, n_clusters) X transformed in the new space of distances to cluster centers. """ - X = check_array(X, accept_sparse=["csr", "csc"], dtypes=[np.float64, np.float32]) + X = check_array(X, accept_sparse=["csr", "csc"]) if self.metric == "precomputed": check_is_fitted(self, "medoid_indices_") @@ -749,7 +749,7 @@ def predict(self, X): labels : array, shape = (n_query,) Index of the cluster each sample belongs to. """ - X = check_array(X, accept_sparse=["csr", "csc"], dtypes=[np.float64, np.float32]) + X = check_array(X, accept_sparse=["csr", "csc"]) if self.metric == "precomputed": check_is_fitted(self, "medoid_indices_") From b5fcfdb076e8777f203cd05872bc4859a3912201 Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Fri, 25 Jun 2021 09:13:29 +0200 Subject: [PATCH 15/16] change name variables --- examples/plot_clara_digits.py | 4 +- sklearn_extra/cluster/_k_medoids.py | 50 +++++++++---------- sklearn_extra/cluster/tests/test_k_medoids.py | 5 +- 3 files changed, 30 insertions(+), 29 deletions(-) diff --git a/examples/plot_clara_digits.py b/examples/plot_clara_digits.py index 9afcb7c8..e1bb1f54 100644 --- a/examples/plot_clara_digits.py +++ b/examples/plot_clara_digits.py @@ -60,7 +60,7 @@ metric="cosine", n_clusters=n_digits, init="heuristic", - sampling_size=50, + n_sampling=50, ), "CLARA (cosine)", ), @@ -69,7 +69,7 @@ metric="manhattan", n_clusters=n_digits, init="heuristic", - sampling_size=50, + n_sampling=50, ), "CLARA (manhattan)", ), diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index d672e33a..128b52bf 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -205,10 +205,6 @@ def fit(self, X, y=None): if self.method == "pam": # Compute the distance to the first and second closest points # among medoids. - if (X.dtype is np.dtype(np.float32)) or ( - X.dtype is np.dtype(np.float16) - ): - D = D.astype(np.float32) if self.n_clusters == 1 and self.max_iter > 0: # PAM SWAP step can only be used for n_clusters > 1 @@ -219,8 +215,6 @@ def fit(self, X, y=None): self.max_iter = 0 elif self.max_iter > 0: Djs, Ejs = np.sort(D[medoid_idxs], axis=0)[[0, 1]] - elif self.init != "build": - D = D.astype(X.dtype) # Continue the algorithm as long as # the medoids keep changing and the maximum number @@ -538,13 +532,13 @@ class CLARA(BaseEstimator, ClusterMixin, TransformerMixin): Specify the maximum number of iterations when fitting PAM. It can be zero in which case only the initialization is computed. - sampling_size : int or None, optional, default : None + n_sampling : int or None, optional, default : None Size of the sampled dataset at each iteration. sampling-size a trade-off between complexity and efficiency. 
If None, it is set to min(n_samples, 40 + 2 * self.n_clusters), as suggested by the authors of the algorithm. It must be smaller than n_samples. - samples : int, optional, default : 5 + n_sampling_iter : int, optional, default : 5 Number of sub-samples to draw, i.e. the number of sampling iterations. random_state : int, RandomState instance or None, optional @@ -594,7 +588,7 @@ class CLARA(BaseEstimator, ClusterMixin, TransformerMixin): Notes ----- Contrary to KMedoids, CLARA is linear in N, the sample size, in both space - and time complexity. On the other hand, it scales quadratically with sampling_size. + and time complexity. On the other hand, it scales quadratically with n_sampling. """ @@ -604,16 +598,16 @@ def __init__( self, n_clusters=8, metric="euclidean", init="build", max_iter=300, - sampling_size=None, - samples=5, + n_sampling=None, + n_sampling_iter=5, random_state=None, ): self.n_clusters = n_clusters self.metric = metric self.init = init self.max_iter = max_iter - self.sampling_size = sampling_size - self.samples = samples + self.n_sampling = n_sampling + self.n_sampling_iter = n_sampling_iter self.random_state = random_state def fit(self, X, y=None): @@ -622,7 +616,7 @@ def fit(self, X, y=None): Parameters ---------- X : array-like, shape = (n_samples, n_features), \ or (n_samples, n_samples) if metric == 'precomputed' Dataset to cluster. y : Ignored Returns ------- self """ - X = check_array(X) + X = check_array(X, dtype=[np.float64, np.float32]) n = len(X) random_state_ = check_random_state(self.random_state) - if self.sampling_size is None: - sampling_size = max( + if self.n_sampling is None: + n_sampling = max( min(n, 40 + 2 * self.n_clusters), self.n_clusters + 1 ) else: - sampling_size = self.sampling_size + n_sampling = self.n_sampling - # Check sampling_size. + # Check n_sampling. if n < self.n_clusters: raise ValueError( "sample_size should be greater than self.n_clusters" ) - if self.n_clusters >= sampling_size: + if self.n_clusters >= n_sampling: raise ValueError( "sampling size must be strictly greater than self.n_clusters" ) medoids_idxs = random_state_.choice( np.arange(n), size=self.n_clusters, replace=False ) best_score = np.inf - for _ in range(self.samples): - if sampling_size >= n: + for _ in range(self.n_sampling_iter): + if n_sampling >= n: sample_idxs = np.arange(n) else: sample_idxs = np.hstack( [ medoids_idxs, random_state_.choice( np.delete(np.arange(n), medoids_idxs), - size=sampling_size - self.n_clusters, + size=n_sampling - self.n_clusters, replace=False, ), ] ) pam = KMedoids( n_clusters=self.n_clusters, metric=self.metric, method="pam", init=self.init, max_iter=self.max_iter, random_state=random_state_, ) pam.fit(X[sample_idxs]) self.cluster_centers_ = pam.cluster_centers_ self.inertia_ = self._compute_inertia(self.transform(X)) @@ -686,7 +686,7 @@ def fit(self, X, y=None): self.medoid_indices_ = medoids_idxs self.labels_ = np.argmin(self.transform(X), axis=1) - self.n_iter_ = self.samples + self.n_iter_ = self.n_sampling_iter return self def transform(self, X): @@ -724,7 +724,9 @@ def transform(self, X): X_new : {array-like, sparse matrix}, shape=(n_query, n_clusters) X transformed in the new space of distances to cluster centers. """ - X = check_array(X, accept_sparse=["csr", "csc"]) + X = check_array( + X, accept_sparse=["csr", "csc"], dtype=[np.float64, np.float32] + ) if self.metric == "precomputed": check_is_fitted(self, "medoid_indices_") return X[:, self.medoid_indices_] else: check_is_fitted(self, "cluster_centers_") Y = self.cluster_centers_ return pairwise_distances(X, Y=Y, metric=self.metric) def predict(self, X): @@ -749,7 +751,9 @@ def predict(self, X): labels : array, shape = (n_query,) Index of the cluster each sample belongs to.
""" - X = check_array(X, accept_sparse=["csr", "csc"]) + X = check_array( + X, accept_sparse=["csr", "csc"], dtype=[np.float64, np.float32] + ) if self.metric == "precomputed": check_is_fitted(self, "medoid_indices_") diff --git a/sklearn_extra/cluster/tests/test_k_medoids.py b/sklearn_extra/cluster/tests/test_k_medoids.py index 8646b990..89742f3d 100644 --- a/sklearn_extra/cluster/tests/test_k_medoids.py +++ b/sklearn_extra/cluster/tests/test_k_medoids.py @@ -370,7 +370,10 @@ def test_clara_consistency_iris(): X_iris = load_iris()["data"] clara = CLARA( - n_clusters=3, samples=1, sampling_size=len(X_iris), random_state=rng + n_clusters=3, + n_sampling_iter=1, + n_sampling=len(X_iris), + random_state=rng, ) model = KMedoids(n_clusters=3, init="build", random_state=rng) From 30652cfe1887e8bc765dc11dcdcbf0d71ccbea0e Mon Sep 17 00:00:00 2001 From: TimotheeMathieu Date: Fri, 25 Jun 2021 09:21:33 +0200 Subject: [PATCH 16/16] create private function inertia and changelog --- doc/changelog.rst | 3 ++ sklearn_extra/cluster/_k_medoids.py | 67 +++++++++++------------------ 2 files changed, 27 insertions(+), 43 deletions(-) diff --git a/doc/changelog.rst b/doc/changelog.rst index 36aa2ecc..053b9197 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -4,6 +4,9 @@ Changelog Unreleased ---------- +- Add `CLARA` (Clustering for Large Applications) which extends k-medoids to + be more scalable using a sampling approach. + [`#83 `_]. - Fix `_estimator_type` for :class:`~sklearn_extra.robust` estimators. Fix misbehavior of scikit-learn's :class:`~sklearn.model_selection.cross_val_score` and :class:`~sklearn.grid_search.GridSearchCV` for :class:`~sklearn_extra.robust.RobustWeightedClassifier` diff --git a/sklearn_extra/cluster/_k_medoids.py b/sklearn_extra/cluster/_k_medoids.py index 128b52bf..cccd575c 100644 --- a/sklearn_extra/cluster/_k_medoids.py +++ b/sklearn_extra/cluster/_k_medoids.py @@ -24,6 +24,27 @@ from ._k_medoids_helper import _compute_optimal_swap, _build +def _compute_inertia(distances): + """Compute inertia of new samples. Inertia is defined as the sum of the + sample distances to closest cluster centers. + + Parameters + ---------- + distances : {array-like, sparse matrix}, shape=(n_samples, n_clusters) + Distances to cluster centers. + + Returns + ------- + Sum of sample distances to closest cluster centers. + """ + + # Define inertia as the sum of the sample-distances + # to closest cluster centers + inertia = np.sum(np.min(distances, axis=1)) + + return inertia + + class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin): """k-medoids clustering. @@ -270,7 +291,7 @@ def fit(self, X, y=None): # the training data to clusters self.labels_ = np.argmin(D[medoid_idxs, :], axis=0) self.medoid_indices_ = medoid_idxs - self.inertia_ = self._compute_inertia(self.transform(X)) + self.inertia_ = _compute_inertia(self.transform(X)) # Return self to enable method chaining return self @@ -312,7 +333,7 @@ def _update_medoid_idxs_in_place(self, D, labels, medoid_idxs): def _compute_cost(self, D, medoid_idxs): """Compute the cose for a given configuration of the medoids""" - return self._compute_inertia(D[:, medoid_idxs]) + return _compute_inertia(D[:, medoid_idxs]) def transform(self, X): """Transforms X to cluster-distance space. @@ -386,26 +407,6 @@ def predict(self, X): return pd_argmin - def _compute_inertia(self, distances): - """Compute inertia of new samples. Inertia is defined as the sum of the - sample distances to closest cluster centers. 
+ + Parameters + ---------- + distances : {array-like, sparse matrix}, shape=(n_samples, n_clusters) + Distances to cluster centers. + + Returns + ------- + Sum of sample distances to closest cluster centers. + """ + + # Define inertia as the sum of the sample-distances + # to closest cluster centers + inertia = np.sum(np.min(distances, axis=1)) + + return inertia + + class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin): """k-medoids clustering. @@ -270,7 +291,7 @@ def fit(self, X, y=None): # the training data to clusters self.labels_ = np.argmin(D[medoid_idxs, :], axis=0) self.medoid_indices_ = medoid_idxs - self.inertia_ = self._compute_inertia(self.transform(X)) + self.inertia_ = _compute_inertia(self.transform(X)) # Return self to enable method chaining return self @@ -312,7 +333,7 @@ def _update_medoid_idxs_in_place(self, D, labels, medoid_idxs): medoid_idxs[k] = cluster_k_idxs[min_cost_idx] def _compute_cost(self, D, medoid_idxs): """Compute the cost for a given configuration of the medoids""" - return self._compute_inertia(D[:, medoid_idxs]) + return _compute_inertia(D[:, medoid_idxs]) def transform(self, X): @@ -386,26 +407,6 @@ def predict(self, X): return pd_argmin - def _compute_inertia(self, distances): - """Compute inertia of new samples. Inertia is defined as the sum of the - sample distances to closest cluster centers. - - Parameters - ---------- - distances : {array-like, sparse matrix}, shape=(n_samples, n_clusters) - Distances to cluster centers. - - Returns - ------- - Sum of sample distances to closest cluster centers. - """ - - # Define inertia as the sum of the sample-distances - # to closest cluster centers - inertia = np.sum(np.min(distances, axis=1)) - - return inertia - def _initialize_medoids(self, D, n_clusters, random_state_): """Select initial medoids when beginning clustering.""" @@ -677,7 +678,7 @@ def fit(self, X, y=None): ) pam.fit(X[sample_idxs]) self.cluster_centers_ = pam.cluster_centers_ - self.inertia_ = self._compute_inertia(self.transform(X)) + self.inertia_ = _compute_inertia(self.transform(X)) if self.inertia_ < best_score: best_score = self.inertia_ @@ -690,26 +691,6 @@ def fit(self, X, y=None): return self - def _compute_inertia(self, distances): - """Compute inertia of new samples. Inertia is defined as the sum of the - sample distances to closest cluster centers. - - Parameters - ---------- - distances : {array-like, sparse matrix}, shape=(n_samples, n_clusters) - Distances to cluster centers. - - Returns - ------- - Sum of sample distances to closest cluster centers. - """ - - # Define inertia as the sum of the sample-distances - # to closest cluster centers - inertia = np.sum(np.min(distances, axis=1)) - - return inertia - def transform(self, X): """Transforms X to cluster-distance space.
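
Usage sketch (assumptions: the post-PATCH-15 parameter names n_sampling and n_sampling_iter, and illustrative data and parameter values):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn_extra.cluster import CLARA, KMedoids

# Two well-separated blobs; 2000 points keeps full PAM tractable for comparison.
X, _ = make_blobs(n_samples=2000, centers=[[0, 0], [4, 4]], random_state=0)

# CLARA fits PAM on n_sampling_iter sub-samples of size n_sampling and keeps
# the medoids with the best inertia over the whole dataset.
clara = CLARA(n_clusters=2, n_sampling=44, n_sampling_iter=5, random_state=0)
clara.fit(X)

# Reference solution: PAM on the full dataset (quadratic in n_samples).
kmedoids = KMedoids(n_clusters=2, method="pam", random_state=0).fit(X)

# CLARA's inertia is typically close to that of the full PAM solution,
# while only ever running PAM on 44-point sub-samples.
print(clara.inertia_, kmedoids.inertia_)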