from sklearn.datasets import load_iris
data = load_iris()
X = data.data
y = data.target
data.target_names
array(['setosa', 'versicolor', 'virginica'], dtype='<U10')
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
from sklearn.mixture import GaussianMixture
model = GaussianMixture(n_components=3, random_state=42)
y_pred = model.fit_predict(X)
import numpy as np
# map Gaussian mixture cluster ids to iris class ids (the right permutation depends on the run)
mapping = np.array([1, 2, 0])
y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])
# fraction of points whose mapped cluster matches the true class
np.sum(y_pred==y) / len(y_pred)
0.03333333333333333
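The hard-coded permutation above may not match a given run. A minimal sketch (not part of the original cells) derives the mapping by majority vote over the true labels; it uses y, so it is for evaluation only, and the names raw_pred and auto_mapping are illustrative.
# sketch: derive the cluster-to-class mapping by majority vote (evaluation only, uses y)
raw_pred = model.predict(X)                 # raw Gaussian mixture cluster ids
auto_mapping = np.array([
    np.bincount(y[raw_pred == c]).argmax()  # most common true class within cluster c
    for c in range(model.n_components)
])
np.mean(auto_mapping[raw_pred] == y)        # accuracy with the derived mapping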
import matplotlib.pyplot as plt
plt.figure(figsize=(9, 3.5))
plt.plot(X[y_pred==0, 2], X[y_pred==0, 3], "yo", label="Cluster 1")
plt.plot(X[y_pred==1, 2], X[y_pred==1, 3], "bs", label="Cluster 2")
plt.plot(X[y_pred==2, 2], X[y_pred==2, 3], "g^", label="Cluster 3")
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.legend(loc="upper left", fontsize=12)
<matplotlib.legend.Legend at 0x7fe17a1eaeb0>
Steps
Cons
* Prefers clusters of similar sizes
* Does not behave well when the clusters have varying sizes, different densities, or nonspherical shapes (see the sketch below)
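To illustrate the nonspherical-shape limitation, a small sketch (illustrative, not from the original cells; it reuses make_moons, which appears later in this notebook, and stores the data in X_moons so the iris X above is untouched; the eps value is a guess and may need tuning): K-Means cuts the two moons with a straight boundary, while a density-based method such as DBSCAN follows them.
# sketch: K-Means vs DBSCAN on nonspherical (two-moons) data
from sklearn.cluster import KMeans, DBSCAN
from sklearn.datasets import make_moons
X_moons, _ = make_moons(n_samples=500, noise=0.05, random_state=42)
km_labels = KMeans(n_clusters=2, n_init=10, random_state=42).fit_predict(X_moons)
db_labels = DBSCAN(eps=0.1, min_samples=5).fit_predict(X_moons)
fig, axes = plt.subplots(1, 2, figsize=(9, 3.5))
axes[0].scatter(X_moons[:, 0], X_moons[:, 1], c=km_labels, s=10)
axes[0].set_title("K-Means: straight split")
axes[1].scatter(X_moons[:, 0], X_moons[:, 1], c=db_labels, s=10)
axes[1].set_title("DBSCAN: follows density")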
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3)
y_pred = kmeans.fit_predict(X)
kmeans.labels_, kmeans.cluster_centers_
(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1], dtype=int32), array([[0.19611111, 0.595 , 0.07830508, 0.06083333], [0.44125683, 0.30737705, 0.57571548, 0.54918033], [0.70726496, 0.4508547 , 0.79704476, 0.82478632]]))
# predict
kmeans.predict(X[10, :].reshape(1, -1))
array([0], dtype=int32)
# transform() gives the distance to each centroid,
# so it can also serve as a (nonlinear) dimensionality reduction technique
kmeans.transform(X[10, :].reshape(1, -1))
array([[0.15884387, 0.82328741, 1.16116836]])
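As a quick sanity check (not in the original notebook), these values should equal the Euclidean distances from the instance to kmeans.cluster_centers_:
# verify: transform() equals the Euclidean distance to each centroid
np.linalg.norm(kmeans.cluster_centers_ - X[10, :], axis=1)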
import matplotlib.pyplot as plt
plt.figure(figsize=(9, 3.5))
plt.plot(X[y_pred==0, 2], X[y_pred==0, 3], "yo", label="Cluster 1")
plt.plot(X[y_pred==1, 2], X[y_pred==1, 3], "bs", label="Cluster 2")
plt.plot(X[y_pred==2, 2], X[y_pred==2, 3], "g^", label="Cluster 3")
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.legend(loc="upper left", fontsize=12)
<matplotlib.legend.Legend at 0x7fd6eee969d0>
from sklearn.metrics import silhouette_score
silhouette_score(X, y_pred)
0.5047687565398589
from sklearn.metrics import silhouette_samples
silhouette_coefficients = silhouette_samples(X, y_pred)
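Silhouette scores are commonly used to pick the number of clusters. A minimal sketch (assuming the scaled iris X from above; not from the original cells):
# compare silhouette scores for several values of k
from sklearn.cluster import KMeans
for k in range(2, 7):
    labels_k = KMeans(n_clusters=k, n_init=10, random_state=42).fit_predict(X)
    print(k, silhouette_score(X, labels_k))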
from sklearn.cluster import MiniBatchKMeans
minibatch_kmeans = MiniBatchKMeans(n_clusters=3)
n_batches = 3
for X_batch in np.array_split(X, n_batches):
    minibatch_kmeans.partial_fit(X_batch)
X.tofile('temp.dat', sep='') # save data to a binary file
# memmap: manipulate a large array stored in a binary file on disk as if it were entirely in memory
X_mm = np.memmap('temp.dat', dtype='float64', mode='readonly', shape=(150, 4))
batch_size = 3
minibatch_kmeans = MiniBatchKMeans(n_clusters=3, batch_size=batch_size)
minibatch_kmeans.fit(X_mm)
MiniBatchKMeans(batch_size=3, compute_labels=True, init='k-means++', init_size=None, max_iter=100, max_no_improvement=10, n_clusters=3, n_init=3, random_state=None, reassignment_ratio=0.01, tol=0.0, verbose=0)
y_pred = minibatch_kmeans.labels_
plt.figure(figsize=(9, 3.5))
plt.plot(X[y_pred==0, 2], X[y_pred==0, 3], "yo", label="Cluster 1")
plt.plot(X[y_pred==1, 2], X[y_pred==1, 3], "bs", label="Cluster 2")
plt.plot(X[y_pred==2, 2], X[y_pred==2, 3], "g^", label="Cluster 3")
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.legend(loc="upper left", fontsize=12)
<matplotlib.legend.Legend at 0x7fd6eef7ac90>
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=1000, noise=0.05)
# choose eps
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(X)
distances, indices = nbrs.kneighbors(X)
distances = np.sort(distances, axis=0)
distances = distances[:,1]
plt.plot(distances)
[<matplotlib.lines.Line2D at 0x7fe1b92bfa00>]
# from the k-distance plot, eps of about 0.06 is reasonable; eps = 0.05 creates seven clusters here
# min_samples depends on domain knowledge
dbscan = DBSCAN(eps=0.05, min_samples=5)
dbscan.fit(X)
DBSCAN(eps=0.05)
# cluster labels of the training instances; label -1 marks points considered anomalies (noise)
dbscan.labels_
# indices of the core instances
dbscan.core_sample_indices_.shape
# core instances
dbscan.components_
array([[ 0.92568308, 0.40760832], [-1.06801473, 0.23598424], [-0.8536932 , 0.49789903], ..., [ 0.54786442, -0.41164933], [-0.08532256, 0.97242348], [ 0.02807615, 1.02441327]])
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)
ax.scatter(X[:, 0], X[:, 1], c=dbscan.labels_, marker='o', s=600, cmap="Paired")
ax.scatter(X[:, 0], X[:, 1], marker='*', s=20, c=dbscan.labels_)
<matplotlib.collections.PathCollection at 0x7fe1c90e93a0>
#### DBSCAN in sklearn has no predict() method
#### Users can choose which classification algorithm to train on the core instances
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(dbscan.components_, dbscan.labels_[dbscan.core_sample_indices_])
KNeighborsClassifier(n_neighbors=50)
X_new = np.array([[-0.5, 0], [0, 0.5], [1, -0.1], [2, 1]])
knn.predict(X_new)
array([1, 0, 1, 0])
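An optional extension (an assumption, not shown in the original cells): points whose nearest core instance is farther away than eps can be flagged as anomalies, using the distances returned by kneighbors.
# flag new points as anomalies (-1) when the nearest core instance is farther than eps
y_dist, y_idx = knn.kneighbors(X_new, n_neighbors=1)
y_pred_new = dbscan.labels_[dbscan.core_sample_indices_][y_idx.ravel()]
y_pred_new[y_dist.ravel() > dbscan.eps] = -1
y_pred_new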
Hierarchical Clustering (bottom-up)
Defining the closest pair of clusters (linkage criterion; see the comparison sketch below)
Pros
Cons
Complexity
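How the closest pair of clusters is defined is controlled by the linkage parameter. A small comparison sketch (illustrative, not from the original cells; run on the moons data generated above), printing the resulting cluster sizes for each criterion:
# compare linkage criteria on the same data (single linkage can follow the moon shapes,
# while ward tends to produce compact clusters of similar size)
from sklearn.cluster import AgglomerativeClustering
for linkage in ("ward", "complete", "average", "single"):
    agg = AgglomerativeClustering(n_clusters=2, linkage=linkage).fit(X)
    print(linkage, np.bincount(agg.labels_))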
X, y = make_moons(n_samples=1000, noise=0.05)
from sklearn.cluster import AgglomerativeClustering
clustering = AgglomerativeClustering(linkage='single') # default n_clusters is 2
clustering.fit(X)
AgglomerativeClustering(linkage='single')
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)
ax.scatter(X[:, 0], X[:, 1], c=clustering.labels_, marker='o', s=600, cmap="Paired")
ax.scatter(X[:, 0], X[:, 1], marker='*', s=20, c=clustering.labels_)
<matplotlib.collections.PathCollection at 0x7fe1c89f3a00>
from sklearn.neighbors import kneighbors_graph
# scales nicely if provided an m*m sparse connectivity matrix;
# without a connectivity matrix, the algorithm does not scale well
A = kneighbors_graph(X, 30, include_self=False)
clustering = AgglomerativeClustering(linkage='ward', connectivity=A, n_clusters=2)
clustering.fit(X)
AgglomerativeClustering(connectivity=<1000x1000 sparse matrix of type '<class 'numpy.float64'>' with 30000 stored elements in Compressed Sparse Row format>)
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)
ax.scatter(X[:, 0], X[:, 1], c=clustering.labels_, marker='o', s=600, cmap="Paired")
ax.scatter(X[:, 0], X[:, 1], marker='*', s=20, c=clustering.labels_)
<matplotlib.collections.PathCollection at 0x7fe168e4baf0>
from sklearn.cluster import Birch
brc = Birch(n_clusters=2)
brc.fit(X)
labels = brc.predict(X)
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)
ax.scatter(X[:, 0], X[:, 1], c=labels, marker='o', s=600, cmap="Paired")
ax.scatter(X[:, 0], X[:, 1], marker='*', s=20, c=labels)
<matplotlib.collections.PathCollection at 0x7fe168f23160>
# mini-batch
brc = Birch(n_clusters=2)
for i in range(10):
    brc.partial_fit(X[i*100:(i+1)*100])   # feed the data in batches of 100 instances
brc.set_params(n_clusters=2)
labels = brc.predict(X)
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)
ax.scatter(X[:, 0], X[:, 1], c=labels, marker='o', s=600, cmap="Paired")
ax.scatter(X[:, 0], X[:, 1], marker='*', s=20, c=labels)
<matplotlib.collections.PathCollection at 0x7fd6f09a45d0>
Steps
Pros
Cons
Complexity
from sklearn.cluster import MeanShift
from sklearn.cluster import estimate_bandwidth
bandwidth = estimate_bandwidth(X, quantile=0.2)
# If bandwidth is not given, the bandwidth is estimated using sklearn.cluster.estimate_bandwidth
clustering = MeanShift(bandwidth=bandwidth)
clustering.fit(X)
labels = clustering.predict(X)
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)
ax.scatter(X[:, 0], X[:, 1], c=labels, marker='o', s=600, cmap="Paired")
ax.scatter(X[:, 0], X[:, 1], marker='*', s=20, c=labels)
<matplotlib.collections.PathCollection at 0x7fe1b8f0a370>
from sklearn.cluster import AffinityPropagation
clustering = AffinityPropagation(damping=0.9, preference=-200)
clustering.fit(X)
labels = clustering.predict(X)
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)
ax.scatter(X[:, 0], X[:, 1], c=labels, marker='o', s=600, cmap="Paired")
ax.scatter(X[:, 0], X[:, 1], marker='*', s=20, c=labels)
<matplotlib.collections.PathCollection at 0x7fe1b8f0a640>
Steps
Pros
from sklearn.cluster import SpectralClustering
sc = SpectralClustering(n_clusters=2, gamma=20, random_state=42)
sc.fit(X)
SpectralClustering(gamma=20, n_clusters=2, random_state=42)
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)
ax.scatter(X[:, 0], X[:, 1], c=sc.labels_, marker='o', s=600, cmap="Paired")
ax.scatter(X[:, 0], X[:, 1], marker='*', s=20, c=sc.labels_)
<matplotlib.collections.PathCollection at 0x7fe1b8afcd00>
Steps
Pros
from sklearn.cluster import OPTICS
clustering = OPTICS(min_samples=30)
clustering.fit(X)
OPTICS(min_samples=30)
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)
ax.scatter(X[:, 0], X[:, 1], c=clustering.labels_, marker='o', s=600, cmap="Paired")
ax.scatter(X[:, 0], X[:, 1], marker='*', s=20, c=clustering.labels_)
<matplotlib.collections.PathCollection at 0x7fe188be6580>
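OPTICS also exposes the reachability distances it computed; plotting them in cluster order gives the reachability plot, where valleys correspond to clusters. A minimal sketch using the fitted model's reachability_ and ordering_ attributes:
# reachability plot: valleys are clusters, peaks separate them
plt.figure(figsize=(9, 3.5))
plt.plot(clustering.reachability_[clustering.ordering_])
plt.ylabel("Reachability distance")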
Perfect labeling scores 1.0; bad labelings have negative or near-zero scores
Pros
from sklearn import metrics
labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 1, 1, 2, 2]
metrics.adjusted_rand_score(labels_true, labels_pred)
0.24242424242424246
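"Adjusted" means corrected for chance: random labelings score close to 0, unlike the unadjusted Rand index. A quick check (a sketch; rand_score is available in recent scikit-learn versions):
# random labelings: ARI is close to 0, the unadjusted Rand index is not
rng = np.random.default_rng(0)
random_true = rng.integers(0, 3, size=100)
random_pred = rng.integers(0, 3, size=100)
print(metrics.rand_score(random_true, random_pred))
print(metrics.adjusted_rand_score(random_true, random_pred))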
Perfect labeling scores 1.0; bad labelings have non-positive scores
Pros
labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 1, 1, 2, 2]
metrics.adjusted_mutual_info_score(labels_true, labels_pred)
metrics.normalized_mutual_info_score(labels_true, labels_pred)
metrics.mutual_info_score(labels_true, labels_pred)
0.4620981203732969
V-measure: harmonic mean of homogeneity and completeness
Pros
labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 1, 1, 2, 2]
metrics.homogeneity_score(labels_true, labels_pred)
metrics.completeness_score(labels_true, labels_pred)
metrics.v_measure_score(labels_true, labels_pred)
0.5158037429793889
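Homogeneity and completeness are not symmetric: swapping the true and predicted labels swaps the two scores, while the V-measure stays the same. A quick check:
# swapping arguments swaps homogeneity and completeness; V-measure is symmetric
print(metrics.homogeneity_score(labels_pred, labels_true))   # equals the completeness above
print(metrics.completeness_score(labels_pred, labels_true))  # equals the homogeneity above
print(metrics.v_measure_score(labels_pred, labels_true))     # unchanged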
Permuting the predicted labels yields the same score
Pros
labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 1, 1, 2, 2]
metrics.fowlkes_mallows_score(labels_true, labels_pred)
0.4714045207910317
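A quick check of the permutation claim above (renaming the predicted cluster ids leaves the score unchanged):
# renaming (permuting) the predicted cluster ids does not change the score
relabeled = [{0: 2, 1: 0, 2: 1}[c] for c in labels_pred]
metrics.fowlkes_mallows_score(labels_true, relabeled)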
A higher Silhouette Coefficient score relates to a model with better-defined clusters
Pros
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn import metrics
X, y = datasets.load_iris(return_X_y=True)
k = 3
kmeans_model = KMeans(n_clusters=k, random_state=1).fit(X)
silhouette_score = metrics.silhouette_score(X, kmeans_model.labels_, metric='euclidean')
from sklearn.metrics import silhouette_samples
y_pred = kmeans_model.labels_
silhouette_coefficients = silhouette_samples(X, y_pred)
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
padding = 10
pos = 0
for i in range(k):
    coeffs = silhouette_coefficients[y_pred == i]
    coeffs.sort()
    color = mpl.cm.Spectral(i / k)
    plt.fill_betweenx(np.arange(pos, pos + len(coeffs)), 0, coeffs, facecolor=color, edgecolor=color, alpha=0.7)
    pos += len(coeffs) + padding
plt.axvline(x=silhouette_score, color = 'red', linestyle = "--")
<matplotlib.lines.Line2D at 0x7fe0b96c8610>
The index is the ratio of between-cluster dispersion to within-cluster dispersion, summed over all clusters
Pros
metrics.calinski_harabasz_score(X, kmeans_model.labels_)
561.62775662962
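As a sanity check (a sketch, not part of the original cells), the index can be recomputed from its definition: with between-cluster dispersion B = sum over clusters of n_c * ||mean_c - mean||^2 and within-cluster dispersion W = sum over clusters of sum of ||x - mean_c||^2, the score is (B / (k - 1)) / (W / (n - k)).
# recompute the Calinski-Harabasz index from its definition
labels = kmeans_model.labels_
classes = np.unique(labels)
overall_mean = X.mean(axis=0)
B = sum((labels == c).sum() * ((X[labels == c].mean(axis=0) - overall_mean) ** 2).sum() for c in classes)
W = sum(((X[labels == c] - X[labels == c].mean(axis=0)) ** 2).sum() for c in classes)
(B / (len(classes) - 1)) / (W / (len(X) - len(classes)))  # should match the value above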
Zero is the lowest possible score. Values closer to zero indicate a better partition
Pros
metrics.davies_bouldin_score(X, kmeans_model.labels_)
0.6619715465007528
from sklearn.metrics.cluster import contingency_matrix
labels_true = [0, 0, 0, 1, 1, 1]
labels_pred = [0, 0, 1, 1, 2, 2]
contingency_matrix(labels_true, labels_pred)
array([[2, 1, 0], [0, 1, 2]])