from sklearn.datasets import load_iris
data = load_iris()
X = data.data
y = data.target
data.target_names
array(['setosa', 'versicolor', 'virginica'], dtype='<U10')
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
from sklearn.mixture import GaussianMixture
model = GaussianMixture(n_components=3, random_state=42)
y_pred = model.fit_predict(X)
import numpy as np
# map Gaussian mixture cluster ids to iris class ids (the right permutation depends on the run)
mapping = np.array([1, 2, 0])
y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])
# fraction of points whose mapped cluster matches the true class
np.sum(y_pred==y) / len(y_pred)
0.03333333333333333
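The hard-coded permutation above may not match a given run. A minimal sketch (not part of the original cells) derives the mapping by majority vote over the true labels; it uses y, so it is for evaluation only, and the names raw_pred and auto_mapping are illustrative.
# sketch: derive the cluster-to-class mapping by majority vote (evaluation only, uses y)
raw_pred = model.predict(X)                 # raw Gaussian mixture cluster ids
auto_mapping = np.array([
    np.bincount(y[raw_pred == c]).argmax()  # most common true class within cluster c
    for c in range(model.n_components)
])
np.mean(auto_mapping[raw_pred] == y)        # accuracy with the derived mapping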
import matplotlib.pyplot as plt
plt.figure(figsize=(9, 3.5))
plt.plot(X[y_pred==0, 2], X[y_pred==0, 3], "yo", label="Cluster 1")
plt.plot(X[y_pred==1, 2], X[y_pred==1, 3], "bs", label="Cluster 2")
plt.plot(X[y_pred==2, 2], X[y_pred==2, 3], "g^", label="Cluster 3")
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.legend(loc="upper left", fontsize=12)
<matplotlib.legend.Legend at 0x7fe17a1eaeb0>
Steps
Cons
* Prefers clusters of similar sizes
* Does not behave well when the clusters have varying sizes, different densities, or nonspherical shapes (see the sketch below)
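To illustrate the nonspherical-shape limitation, a small sketch (illustrative, not from the original cells; it reuses make_moons, which appears later in this notebook, and stores the data in X_moons so the iris X above is untouched; the eps value is a guess and may need tuning): K-Means cuts the two moons with a straight boundary, while a density-based method such as DBSCAN follows them.
# sketch: K-Means vs DBSCAN on nonspherical (two-moons) data
from sklearn.cluster import KMeans, DBSCAN
from sklearn.datasets import make_moons
X_moons, _ = make_moons(n_samples=500, noise=0.05, random_state=42)
km_labels = KMeans(n_clusters=2, n_init=10, random_state=42).fit_predict(X_moons)
db_labels = DBSCAN(eps=0.1, min_samples=5).fit_predict(X_moons)
fig, axes = plt.subplots(1, 2, figsize=(9, 3.5))
axes[0].scatter(X_moons[:, 0], X_moons[:, 1], c=km_labels, s=10)
axes[0].set_title("K-Means: straight split")
axes[1].scatter(X_moons[:, 0], X_moons[:, 1], c=db_labels, s=10)
axes[1].set_title("DBSCAN: follows density")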
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3)
y_pred = kmeans.fit_predict(X)
kmeans.labels_, kmeans.cluster_centers_
(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1], dtype=int32), array([[0.19611111, 0.595 , 0.07830508, 0.06083333], [0.44125683, 0.30737705, 0.57571548, 0.54918033], [0.70726496, 0.4508547 , 0.79704476, 0.82478632]]))
# predict
kmeans.predict(X[10, :].reshape(1, -1))
array([0], dtype=int32)
# transform() gives the distance to each centroid,
# so it can also serve as a (nonlinear) dimensionality reduction technique
kmeans.transform(X[10, :].reshape(1, -1))
array([[0.15884387, 0.82328741, 1.16116836]])
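As a quick sanity check (not in the original notebook), these values should equal the Euclidean distances from the instance to kmeans.cluster_centers_:
# verify: transform() equals the Euclidean distance to each centroid
np.linalg.norm(kmeans.cluster_centers_ - X[10, :], axis=1)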
import matplotlib.pyplot as plt
plt.figure(figsize=(9, 3.5))
plt.plot(X[y_pred==0, 2], X[y_pred==0, 3], "yo", label="Cluster 1")
plt.plot(X[y_pred==1, 2], X[y_pred==1, 3], "bs", label="Cluster 2")
plt.plot(X[y_pred==2, 2], X[y_pred==2, 3], "g^", label="Cluster 3")
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.legend(loc="upper left", fontsize=12)
<matplotlib.legend.Legend at 0x7fd6eee969d0>
from sklearn.metrics import silhouette_score
silhouette_score(X, y_pred)
0.5047687565398589
from sklearn.metrics import silhouette_samples
silhouette_coefficients = silhouette_samples(X, y_pred)
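Silhouette scores are commonly used to pick the number of clusters. A minimal sketch (assuming the scaled iris X from above; not from the original cells):
# compare silhouette scores for several values of k
from sklearn.cluster import KMeans
for k in range(2, 7):
    labels_k = KMeans(n_clusters=k, n_init=10, random_state=42).fit_predict(X)
    print(k, silhouette_score(X, labels_k))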
from sklearn.cluster import MiniBatchKMeans
minibatch_kmeans = MiniBatchKMeans(n_clusters=3)
n_batches = 3
for X_batch in np.array_split(X, n_batches):
    minibatch_kmeans.partial_fit(X_batch)
X.tofile('temp.dat', sep='') # save data to a binary file
# memmap: manipulate a large array stored in a binary file on disk as if it were entirely in memory
X_mm = np.memmap('temp.dat', dtype='float64', mode='readonly', shape=(150, 4))
batch_size = 3
minibatch_kmeans = MiniBatchKMeans(n_clusters=3, batch_size=batch_size)
minibatch_kmeans.fit(X_mm)
MiniBatchKMeans(batch_size=3, compute_labels=True, init='k-means++', init_size=None, max_iter=100, max_no_improvement=10, n_clusters=3, n_init=3, random_state=None, reassignment_ratio=0.01, tol=0.0, verbose=0)
y_pred = minibatch_kmeans.labels_
plt.figure(figsize=(9, 3.5))
plt.plot(X[y_pred==0, 2], X[y_pred==0, 3], "yo", label="Cluster 1")
plt.plot(X[y_pred==1, 2], X[y_pred==1, 3], "bs", label="Cluster 2")
plt.plot(X[y_pred==2, 2], X[y_pred==2, 3], "g^", label="Cluster 3")
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.legend(loc="upper left", fontsize=12)
<matplotlib.legend.Legend at 0x7fd6eef7ac90>
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=1000, noise=0.05)
# choose eps
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(X)
distances, indices = nbrs.kneighbors(X)
distances = np.sort(distances, axis=0)
distances = distances[:,1]
plt.plot(distances)
[<matplotlib.lines.Line2D at 0x7fe1b92bfa00>]
# from the k-distance plot, eps of about 0.06 is reasonable; eps = 0.05 creates seven clusters here
# min_samples depends on domain knowledge
dbscan = DBSCAN(eps=0.05, min_samples=5)
dbscan.fit(X)
DBSCAN(eps=0.05)
# cluster labels of the training instances; label -1 marks points considered anomalies (noise)
dbscan.labels_
# indices of the core instances
dbscan.core_sample_indices_.shape
# core instances
dbscan.components_
array([[ 0.92568308, 0.40760832], [-1.06801473, 0.23598424], [-0.8536932 , 0.49789903], ..., [ 0.54786442, -0.41164933], [-0.08532256, 0.97242348], [ 0.02807615, 1.02441327]])
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)
ax.scatter(X[:, 0], X[:, 1], c=dbscan.labels_, marker='o', s=600, cmap="Paired")
ax.scatter(X[:, 0], X[:, 1], marker='*', s=20, c=dbscan.labels_)
<matplotlib.collections.PathCollection at 0x7fe1c90e93a0>
#### DBSCAN in sklearn has no predict() method
#### Users can choose which classification algorithm to train on the core instances
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(dbscan.components_, dbscan.labels_[dbscan.core_sample_indices_])
KNeighborsClassifier(n_neighbors=50)
X_new = np.array([[-0.5, 0], [0, 0.5], [1, -0.1], [2, 1]])
knn.predict(X_new)
array([1, 0, 1, 0])
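An optional extension (an assumption, not shown in the original cells): points whose nearest core instance is farther away than eps can be flagged as anomalies, using the distances returned by kneighbors.
# flag new points as anomalies (-1) when the nearest core instance is farther than eps
y_dist, y_idx = knn.kneighbors(X_new, n_neighbors=1)
y_pred_new = dbscan.labels_[dbscan.core_sample_indices_][y_idx.ravel()]
y_pred_new[y_dist.ravel() > dbscan.eps] = -1
y_pred_new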
Hierarchical Clustering (bottom-up)
Defining the closest pair of clusters (linkage criterion; see the comparison sketch below)
Pros
Cons
Complexity
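How the closest pair of clusters is defined is controlled by the linkage parameter. A small comparison sketch (illustrative, not from the original cells; run on the moons data generated above), printing the resulting cluster sizes for each criterion:
# compare linkage criteria on the same data (single linkage can follow the moon shapes,
# while ward tends to produce compact clusters of similar size)
from sklearn.cluster import AgglomerativeClustering
for linkage in ("ward", "complete", "average", "single"):
    agg = AgglomerativeClustering(n_clusters=2, linkage=linkage).fit(X)
    print(linkage, np.bincount(agg.labels_))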
X, y = make_moons(n_samples=1000, noise=0.05)
from sklearn.cluster import AgglomerativeClustering
clustering = AgglomerativeClustering(linkage='single') # default n_clusters is 2
clustering.fit(X)
AgglomerativeClustering(linkage='single')
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)
ax.scatter(X[:, 0], X[:, 1], c=clustering.labels_, marker='o', s=600, cmap="Paired")
ax.scatter(X[:, 0], X[:, 1], marker='*', s=20, c=clustering.labels_)
<matplotlib.collections.PathCollection at 0x7fe1c89f3a00>
from sklearn.neighbors import kneighbors_graph
# scales nicely if provided an m*m sparse connectivity matrix;
# without a connectivity matrix, the algorithm does not scale well
A = kneighbors_graph(X, 30, include_self=False)
clustering = AgglomerativeClustering(linkage='ward', connectivity=A, n_clusters=2)
clustering.fit(X)
AgglomerativeClustering(connectivity=<1000x1000 sparse matrix of type '<class 'numpy.float64'>' with 30000 stored elements in Compressed Sparse Row format>)
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)
ax.scatter(X[:, 0], X[:, 1], c=clustering.labels_, marker='o', s=600, cmap="Paired")
ax.scatter(X[:, 0], X[:, 1], marker='*', s=20, c=clustering.labels_)
<matplotlib.collections.PathCollection at 0x7fe168e4baf0>
from sklearn.cluster import Birch
brc = Birch(n_clusters=2)
brc.fit(X)
labels = brc.predict(X)
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)
ax.scatter(X[:, 0], X[:, 1], c=labels, marker='o', s=600, cmap="Paired")
ax.scatter(X[:, 0], X[:, 1], marker='*', s=20, c=labels)
<matplotlib.collections.PathCollection at 0x7fe168f23160>
# mini-batch
brc = Birch(n_clusters=2)
for i in range(10):
    brc.partial_fit(X[i*100:(i+1)*100])   # feed the data in batches of 100 instances
brc.set_params(n_clusters=2)
labels = brc.predict(X)
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)
ax.scatter(X[:, 0], X[:, 1], c=labels, marker='o', s=600, cmap="Paired")
ax.scatter(X[:, 0], X[:, 1], marker='*', s=20, c=labels)
<matplotlib.collections.PathCollection at 0x7fd6f09a45d0>
Steps
Pros
Cons
Complexity
from sklearn.cluster import MeanShift
from sklearn.cluster import estimate_bandwidth
bandwidth = estimate_bandwidth(X, quantile=0.2)
# If bandwidth is not given, the bandwidth is estimated using sklearn.cluster.estimate_bandwidth
clustering = MeanShift(bandwidth=bandwidth)
clustering.fit(X)
labels = clustering.predict(X)
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)
ax.scatter(X[:, 0], X[:, 1], c=labels, marker='o', s=600, cmap="Paired")
ax.scatter(X[:, 0], X[:, 1], marker='*', s=20, c=labels)
<matplotlib.collections.PathCollection at 0x7fe1b8f0a370>
from sklearn.cluster import AffinityPropagation
clustering = AffinityPropagation(damping=0.9, preference=-200)
clustering.fit(X)
labels = clustering.predict(X)
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)
ax.scatter(X[:, 0], X[:, 1], c=labels, marker='o', s=600, cmap="Paired")
ax.scatter(X[:, 0], X[:, 1], marker='*', s=20, c=labels)
<matplotlib.collections.PathCollection at 0x7fe1b8f0a640>
Steps
Pros
from sklearn.cluster import SpectralClustering
sc = SpectralClustering(n_clusters=2, gamma=20, random_state=42)
sc.fit(X)
SpectralClustering(gamma=20, n_clusters=2, random_state=42)
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)
ax.scatter(X[:, 0], X[:, 1], c=sc.labels_, marker='o', s=600, cmap="Paired")
ax.scatter(X[:, 0], X[:, 1], marker='*', s=20, c=sc.labels_)
<matplotlib.collections.PathCollection at 0x7fe1b8afcd00>
Steps
Pros
from sklearn.cluster import OPTICS
clustering = OPTICS(min_samples=30)
clustering.fit(X)
OPTICS(min_samples=30)
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)
ax.scatter(X[:, 0], X[:, 1], c=clustering.labels_, marker='o', s=600, cmap="Paired")
ax.scatter(X[:, 0], X[:, 1], marker='*', s=20, c=clustering.labels_)
<matplotlib.collections.PathCollection at 0x7fe188be6580>
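OPTICS also exposes the reachability distances it computed; plotting them in cluster order gives the reachability plot, where valleys correspond to clusters. A minimal sketch using the fitted model's reachability_ and ordering_ attributes:
# reachability plot: valleys are clusters, peaks separate them
plt.figure(figsize=(9, 3.5))
plt.plot(clustering.reachability_[clustering.ordering_])
plt.ylabel("Reachability distance")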
Perfect labeling scores 1.0; bad labelings have negative or near-zero scores
Pros
from sklearn import metrics
labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 1, 1, 2, 2]
metrics.adjusted_rand_score(labels_true, labels_pred)
0.24242424242424246
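"Adjusted" means corrected for chance: random labelings score close to 0, unlike the unadjusted Rand index. A quick check (a sketch; rand_score is available in recent scikit-learn versions):
# random labelings: ARI is close to 0, the unadjusted Rand index is not
rng = np.random.default_rng(0)
random_true = rng.integers(0, 3, size=100)
random_pred = rng.integers(0, 3, size=100)
print(metrics.rand_score(random_true, random_pred))
print(metrics.adjusted_rand_score(random_true, random_pred))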
Perfect labeling scores 1.0; bad labelings have non-positive scores
Pros
labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 1, 1, 2, 2]
metrics.adjusted_mutual_info_score(labels_true, labels_pred)
metrics.normalized_mutual_info_score(labels_true, labels_pred)
metrics.mutual_info_score(labels_true, labels_pred)
0.4620981203732969
V-measure: harmonic mean of homogeneity and completeness
Pros
labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 1, 1, 2, 2]
metrics.homogeneity_score(labels_true, labels_pred)
metrics.completeness_score(labels_true, labels_pred)
metrics.v_measure_score(labels_true, labels_pred)
0.5158037429793889
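Homogeneity and completeness are not symmetric: swapping the true and predicted labels swaps the two scores, while the V-measure stays the same. A quick check:
# swapping arguments swaps homogeneity and completeness; V-measure is symmetric
print(metrics.homogeneity_score(labels_pred, labels_true))   # equals the completeness above
print(metrics.completeness_score(labels_pred, labels_true))  # equals the homogeneity above
print(metrics.v_measure_score(labels_pred, labels_true))     # unchanged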
Permuting the predicted labels yields the same score
Pros
labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 1, 1, 2, 2]
metrics.fowlkes_mallows_score(labels_true, labels_pred)
0.4714045207910317
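A quick check of the permutation claim above (renaming the predicted cluster ids leaves the score unchanged):
# renaming (permuting) the predicted cluster ids does not change the score
relabeled = [{0: 2, 1: 0, 2: 1}[c] for c in labels_pred]
metrics.fowlkes_mallows_score(labels_true, relabeled)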
A higher Silhouette Coefficient score relates to a model with better-defined clusters
Pros
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn import metrics
X, y = datasets.load_iris(return_X_y=True)
k = 3
kmeans_model = KMeans(n_clusters=k, random_state=1).fit(X)
silhouette_score = metrics.silhouette_score(X, kmeans_model.labels_, metric='euclidean')
from sklearn.metrics import silhouette_samples
y_pred = kmeans_model.labels_
silhouette_coefficients = silhouette_samples(X, y_pred)
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
padding = 10
pos = 0
for i in range(k):
    coeffs = silhouette_coefficients[y_pred == i]
    coeffs.sort()
    color = mpl.cm.Spectral(i / k)
    plt.fill_betweenx(np.arange(pos, pos + len(coeffs)), 0, coeffs, facecolor=color, edgecolor=color, alpha=0.7)
    pos += len(coeffs) + padding
plt.axvline(x=silhouette_score, color = 'red', linestyle = "--")
<matplotlib.lines.Line2D at 0x7fe0b96c8610>
The index is the ratio of between-cluster dispersion to within-cluster dispersion, summed over all clusters
Pros
metrics.calinski_harabasz_score(X, kmeans_model.labels_)
561.62775662962
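As a sanity check (a sketch, not part of the original cells), the index can be recomputed from its definition: with between-cluster dispersion B = sum over clusters of n_c * ||mean_c - mean||^2 and within-cluster dispersion W = sum over clusters of sum of ||x - mean_c||^2, the score is (B / (k - 1)) / (W / (n - k)).
# recompute the Calinski-Harabasz index from its definition
labels = kmeans_model.labels_
classes = np.unique(labels)
overall_mean = X.mean(axis=0)
B = sum((labels == c).sum() * ((X[labels == c].mean(axis=0) - overall_mean) ** 2).sum() for c in classes)
W = sum(((X[labels == c] - X[labels == c].mean(axis=0)) ** 2).sum() for c in classes)
(B / (len(classes) - 1)) / (W / (len(X) - len(classes)))  # should match the value above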
Zero is the lowest possible score. Values closer to zero indicate a better partition
Pros
metrics.davies_bouldin_score(X, kmeans_model.labels_)
0.6619715465007528
from sklearn.metrics.cluster import contingency_matrix
labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 1, 1, 2, 2]
contingency_matrix(labels_true, labels_pred)
array([[2, 1, 0], [0, 1, 2]])