Add CLARA Clustering algorithm (#83)

* add CLARA * add example * fix typo * add doc * fix docstring * add CLARA to test_common * add size check to pass tests * fix tests * update doc * add test consistency clara kmedoids * black * handle types KMedoids * Apply suggestions from code review Co-authored-by: Roman Yurchak <[email protected]> * correct 32 bit * change name variables * create private function inertia and changelog Co-authored-by: Roman Yurchak <[email protected]>
scikit-learn-contrib · Jun 25, 2021 · 5c47ba2 · 5c47ba2
1 parent 445aaf8
commit 5c47ba2
Show file tree

Hide file tree

Showing 8 changed files with 470 additions and 33 deletions.
diff --git a/doc/api.rst b/doc/api.rst
@@ -32,6 +32,7 @@ Clustering
 
    cluster.KMedoids
    cluster.CommonNNClustering
+   cluster.CLARA
 
 Robust
 ====================

diff --git a/doc/changelog.rst b/doc/changelog.rst
@@ -4,6 +4,9 @@ Changelog
 Unreleased
 ----------
 
+- Add `CLARA` (Clustering for Large Applications) which extends k-medoids to
+  be more scalable using a sampling approach.
+  [`#83 <https://github.com/scikit-learn-contrib/scikit-learn-extra/pull/83>`_].
 - Fix `_estimator_type` for :class:`~sklearn_extra.robust` estimators. Fix
   misbehavior of scikit-learn's :class:`~sklearn.model_selection.cross_val_score` and
   :class:`~sklearn.grid_search.GridSearchCV` for :class:`~sklearn_extra.robust.RobustWeightedClassifier`

diff --git a/doc/modules/cluster.rst b/doc/modules/cluster.rst
@@ -1,8 +1,8 @@
 .. _cluster:
 
-=====================================================
-Clustering with KMedoids and Common-nearest-neighbors
-=====================================================
+============================================================
+Clustering with KMedoids, CLARA and Common-nearest-neighbors
+============================================================
 .. _k_medoids:
 
 K-Medoids
@@ -82,6 +82,38 @@ when speed is an issue.
     for performing face recognition. International Journal of Soft Computing,
     Mathematics and Control, 3(3), pp 1-12.
 
+
+
+CLARA
+=====
+
+    :class:`CLARA` is related to the :class:`KMedoids` algorithm. CLARA
+    (Clustering for Large Applications) extends k-medoids to be more scalable
+    by using a sampling approach.
+
+    .. topic:: Examples:
+
+      * :ref:`sphx_glr_auto_examples_plot_clara_digits.py`: Comparing the
+        computation time of K-Medoids and CLARA on the handwritten digits data.
+
+
+    **Algorithm description:**
+    CLARA uses random samples of the dataset, each of size `sampling_size`.
+    The algorithm is iterative: first we select one sub-sample, then CLARA applies
+    KMedoids on this sub-sample to obtain `n_clusters` medoids. At the next step,
+    CLARA samples `sampling_size` - `n_clusters` points from the dataset and the next
+    sub-sample is composed of the best medoids found until now (with respect to
+    inertia in the whole dataset, not the inertia only on the sub-sample) to which
+    we add the new samples just drawn. Then, K-Medoids is applied to this new
+    sub-sample, and the procedure loops back until `sample` sub-samples have been used.
+
+
+    .. topic:: References:
+
+      * Kaufman, L. and Rousseeuw, P.J. (2008). Clustering Large Applications (Program CLARA).
+        In Finding Groups in Data (eds L. Kaufman and P.J. Rousseeuw).
+        doi:10.1002/9780470316801.ch2
+
 .. _commonnn:
 
 Common-nearest-neighbors clustering

diff --git a/examples/plot_clara_digits.py b/examples/plot_clara_digits.py
@@ -0,0 +1,121 @@
+"""
+======================================================================
+A demo of K-Medoids vs CLARA clustering on the handwritten digits data
+======================================================================
+In this example we compare different computation time of K-Medoids and CLARA on
+the handwritten digits data.
+"""
+import numpy as np
+import matplotlib.pyplot as plt
+import time
+
+from sklearn_extra.cluster import KMedoids, CLARA
+from sklearn.datasets import load_digits
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import scale
+
+print(__doc__)
+
+# Authors: Timo Erkkilä <[email protected]>
+#          Antti Lehmussola <[email protected]>
+#          Kornel Kiełczewski <[email protected]>
+# License: BSD 3 clause
+
+# Seed NumPy's global random number generator (presumably so the run is
+# reproducible -- confirm that the estimators draw from the global RNG).
+np.random.seed(42)
+
+# Load the digits dataset, standardize the features and count the classes;
+# the number of distinct targets is used as the number of clusters below.
+digits = load_digits()
+data = scale(digits.data)
+n_digits = len(np.unique(digits.target))
+
+# Project the data onto two principal components so it can be drawn in 2D.
+reduced_data = PCA(n_components=2).fit_transform(data)
+
+# Step size of the mesh. Decrease to increase the quality of the VQ.
+h = 0.02  # spacing between mesh points over [x_min, x_max]x[y_min, y_max].
+
+# Build the mesh used to plot the decision boundary: every mesh point is
+# later assigned a color according to the cluster predicted for it.
+x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
+y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
+xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
+
+plt.figure()
+plt.clf()
+
+plt.suptitle(
+    "Comparing KMedoids and CLARA",
+    fontsize=14,
+)
+
+
+# Each entry pairs an unfitted estimator with the label used in its subplot
+# title: KMedoids and CLARA, each with the cosine and manhattan metrics.
+selected_models = [
+    (
+        KMedoids(metric="cosine", n_clusters=n_digits),
+        "KMedoids (cosine)",
+    ),
+    (
+        KMedoids(metric="manhattan", n_clusters=n_digits),
+        "KMedoids (manhattan)",
+    ),
+    (
+        CLARA(
+            metric="cosine",
+            n_clusters=n_digits,
+            init="heuristic",
+            n_sampling=50,
+        ),
+        "CLARA (cosine)",
+    ),
+    (
+        CLARA(
+            metric="manhattan",
+            n_clusters=n_digits,
+            init="heuristic",
+            n_sampling=50,
+        ),
+        "CLARA (manhattan)",
+    ),
+]
+
+# Subplot grid: two columns, and as many rows as needed to fit every model.
+plot_rows = int(np.ceil(len(selected_models) / 2.0))
+plot_cols = 2
+
+for i, (model, description) in enumerate(selected_models):
+
+    # Fit the model and obtain labels for each point in the mesh. Both the
+    # fit and the prediction are included in the measured computation time.
+    init_time = time.time()
+    model.fit(reduced_data)
+    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
+    computation_time = time.time() - init_time
+
+    # Put the result into a color plot
+    Z = Z.reshape(xx.shape)
+    # NOTE(review): plt.subplot takes (nrows, ncols, index), but plot_cols is
+    # passed in the nrows position here. Harmless while both values equal 2;
+    # confirm the intended order before changing the number of models.
+    plt.subplot(plot_cols, plot_rows, i + 1)
+    plt.imshow(
+        Z,
+        interpolation="nearest",
+        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
+        cmap=plt.cm.Paired,
+        aspect="auto",
+        origin="lower",
+    )
+
+    # Overlay the (PCA-reduced) data points as small semi-transparent dots.
+    plt.plot(
+        reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=2, alpha=0.3
+    )
+    # Plot the cluster centers of the fitted model as a white X
+    centroids = model.cluster_centers_
+    plt.scatter(
+        centroids[:, 0],
+        centroids[:, 1],
+        marker="x",
+        s=169,
+        linewidths=3,
+        color="w",
+        zorder=10,
+    )
+    # Title shows the model label followed by the measured time in seconds.
+    plt.title(description + ": %.2Fs" % (computation_time))
+    plt.xlim(x_min, x_max)
+    plt.ylim(y_min, y_max)
+    plt.xticks(())
+    plt.yticks(())
+
+plt.show()
diff --git a/sklearn_extra/cluster/__init__.py b/sklearn_extra/cluster/__init__.py
@@ -1,4 +1,4 @@
-from ._k_medoids import KMedoids
+from ._k_medoids import KMedoids, CLARA
 from ._commonnn import commonnn, CommonNNClustering
 
-__all__ = ["KMedoids", "CommonNNClustering", "commonnn"]
+__all__ = ["KMedoids", "CLARA", "CommonNNClustering", "commonnn"]