# Load the iris dataset and rescale every feature into the [0, 1] range,
# so no single feature dominates the mixture fit.
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler

data = load_iris()
X, y = data.data, data.target
data.target_names  # the three iris species (bare expression: notebook display)

scaler = MinMaxScaler()
X = scaler.fit_transform(X)
# Fit a 3-component Gaussian mixture (EM) on the scaled features.
from sklearn.mixture import GaussianMixture

model = GaussianMixture(n_components=3, random_state=42).fit(X)

y_pred = model.predict(X)             # hard clustering: most likely component per sample
y_pred_prob = model.predict_proba(X)  # soft clustering: per-component responsibilities

# Learned parameters (bare expressions: notebook display only):
model.weights_      # mixing proportions (phi)
model.means_        # component means (mu)
model.covariances_  # per-component covariance matrices
# Scatter the hard cluster assignments in petal-length / petal-width space
# (feature columns 2 and 3).
import matplotlib.pyplot as plt

plt.figure(figsize=(9, 3.5))
for cluster, style in enumerate(("yo", "bs", "g^")):
    mask = y_pred == cluster
    plt.plot(X[mask, 2], X[mask, 3], style, label=f"Cluster {cluster + 1}")
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.legend(loc="upper left", fontsize=12)
# Use the fitted mixture as a generative model: draw 100 new points
# together with the component that produced each one.
import matplotlib.pyplot as plt

X_new, y_new = model.sample(100)
model.score_samples(X_new)  # log of the PDF at each sample (notebook display)

# Plot the generated points, colored by source component.
plt.figure(figsize=(9, 3.5))
for cluster, style in enumerate(("yo", "bs", "g^")):
    mask = y_new == cluster
    plt.plot(X_new[mask, 2], X_new[mask, 3], style, label=f"Cluster {cluster + 1}")
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.legend(loc="upper left", fontsize=12)
import numpy as np

# Anomaly detection with the fitted mixture: points in the lowest-density
# regions are flagged as outliers.
contamination = 4  # percent of the samples assumed to be outliers

densities = model.score_samples(X)  # log-PDF of every training sample
# Fix: use the `contamination` variable instead of the hard-coded 4, so the
# threshold actually tracks the declared outlier rate.
density_threshold = np.percentile(densities, contamination)
anomalies = X[densities < density_threshold]  # samples in the low-density tail
# Replot the clusters and overlay the detected outliers as red crosses.
import matplotlib.pyplot as plt

plt.figure(figsize=(9, 3.5))
for cluster, style in enumerate(("yo", "bs", "g^")):
    mask = y_pred == cluster
    plt.plot(X[mask, 2], X[mask, 3], style, label=f"Cluster {cluster + 1}")
plt.plot(anomalies[:, 2], anomalies[:, 3], 'rx', label="Outlier")
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.legend(loc="upper left", fontsize=12)
# Model selection: BIC/AIC penalize likelihood by model complexity, where
# $\hat{L}$ is the maximized value of the model's likelihood function.
# Choose the k that yields the lowest BIC and AIC.
# Fit a GMM for each candidate k in 1..9 and record BIC/AIC.
# Fix 1: the loop body had lost its indentation (IndentationError as written).
# Fix 2: np.empty leaves slot 0 as uninitialized memory; np.full with NaN
# makes the unused index-0 entry explicit and harmless.
bics = np.full(10, np.nan)
aics = np.full(10, np.nan)
for n_components in range(1, 10):
    model = GaussianMixture(n_components=n_components, random_state=42)
    model.fit(X)
    bics[n_components] = model.bic(X)
    aics[n_components] = model.aic(X)
# Plot both information criteria against k; the best k minimizes them.
import matplotlib.pyplot as plt

ks = range(1, 10)
plt.figure(figsize=(9, 3.5))
plt.plot(ks, bics[1:], "yo-", label="BIC")
plt.plot(ks, aics[1:], "bs-", label="AIC")
plt.xlabel("k", fontsize=14)
plt.ylabel("Information Criterion", fontsize=14)
plt.legend(loc="upper left", fontsize=12)
# Bayesian GMM: deliberately over-provision components (10) and let
# variational inference shrink the weights of the superfluous ones.
from sklearn.mixture import BayesianGaussianMixture
import matplotlib.pyplot as plt

model = BayesianGaussianMixture(n_components=10, n_init=10)
model.fit(X)
np.round(model.weights_, 2)  # unused components get ~0 weight (notebook display)

# Plot the resulting hard assignments for the three surviving clusters.
y_pred = model.predict(X)
plt.figure(figsize=(9, 3.5))
for cluster, style in enumerate(("yo", "bs", "g^")):
    mask = y_pred == cluster
    plt.plot(X[mask, 2], X[mask, 3], style, label=f"Cluster {cluster + 1}")
plt.xlabel("Petal length", fontsize=14)
plt.ylabel("Petal width", fontsize=14)
plt.legend(loc="upper left", fontsize=12)