# Load MNIST. fetch_mldata was removed in scikit-learn 0.22 (mldata.org is
# defunct); fetch_openml is the supported replacement.
# as_frame=False returns plain numpy arrays; OpenML labels arrive as strings,
# so cast them to integers before numeric comparisons like `== 5`.
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
data_X, data_Y = mnist['data'], mnist['target'].astype(int)
from sklearn.model_selection import train_test_split
# Stratify so every digit keeps its class proportion in both splits.
train_X, test_X, train_Y, test_Y = train_test_split(
    data_X, data_Y, test_size=0.2, random_state=42, stratify=data_Y)
# Binary targets for the "is this digit a 5?" detector.
train_Y_5 = (train_Y == 5)
test_Y_5 = (test_Y == 5)
# Train a linear classifier (SGD) on the binary "is-5" task.
from sklearn.linear_model import SGDClassifier
sgd_cif = SGDClassifier(random_state=42)
sgd_cif.fit(train_X, train_Y_5)
# Sanity check on one held-out digit: predicted label vs. true label.
sample = test_X[100].reshape(1, -1)
sgd_cif.predict(sample), test_Y_5[100]
# decision_function() exposes the raw per-instance score the classifier
# thresholds to produce a prediction.
sgd_cif.decision_function(sample)
from sklearn.model_selection import cross_val_score
# 3-fold cross-validated accuracy; for classifiers cross_val_score uses
# stratified folds, so this matches stratified K-fold validation.
scores = cross_val_score(sgd_cif, train_X, train_Y_5, scoring="accuracy", cv=3)
scores
from sklearn.model_selection import cross_val_predict
# cross_val_predict yields an out-of-fold ("clean") prediction for every
# training instance via K-fold cross-validation.
train_pred = cross_val_predict(sgd_cif, X=train_X, y=train_Y_5, cv=3)
from sklearn.metrics import confusion_matrix
# Rows = actual classes, columns = predicted classes.
confusion_matrix(train_Y_5, train_pred)
from sklearn.metrics import precision_score, recall_score, f1_score
# Precision: of the instances flagged as 5, the fraction that truly are 5.
prec = precision_score(train_Y_5, train_pred)
prec
# Recall: of the actual 5s, the fraction that were caught.
rec = recall_score(train_Y_5, train_pred)
rec
# F1: harmonic mean of precision and recall.
f1 = f1_score(train_Y_5, train_pred)
f1
# Re-run cross-validation, but collect the raw decision score for each
# instance instead of hard 0/1 predictions.
scores = cross_val_predict(sgd_cif, train_X, train_Y_5, cv=3,
                           method="decision_function")
from sklearn.metrics import precision_recall_curve
# Precision and recall evaluated at every candidate threshold.
precisions, recalls, thresholds = precision_recall_curve(train_Y_5, scores)
import matplotlib.pyplot as plt
# Precision and recall as functions of the decision threshold.
# precisions/recalls have one more entry than thresholds, so drop the last.
fig, ax = plt.subplots()
line1, = ax.plot(thresholds, precisions[:-1], 'b--')
line2, = ax.plot(thresholds, recalls[:-1], 'g-')
ax.set_xlabel('Threshold', size=18)
ax.set_ylabel('Probability', size=18)
# Fixed legend typo: 'Precsion' -> 'Precision'.
legend = ax.legend((line1, line2), ('Precision', 'Recall'), loc='upper left',
                   shadow=True, facecolor='0.9', bbox_to_anchor=(0.01, 1))
# Precision vs. recall (the PR curve). Precision may sometimes dip when the
# threshold is raised, which shows up as jaggedness in this curve.
fig, ax = plt.subplots()
ax.plot(recalls, precisions)
ax.set_xlabel('Recall', size=18)
# Fixed axis label: the y-axis plots precision, not a generic "Probability".
ax.set_ylabel('Precision', size=18)
# Predict with a specific threshold: pick the lowest score that achieves
# at least 90% precision.
import numpy as np
# argmax on a boolean array returns the index of the first True.
idx_90 = np.argmax(precisions >= 0.90)
threshold_90_precision = thresholds[idx_90]
y_train_pred_90 = scores >= threshold_90_precision
precision_score(train_Y_5, y_train_pred_90), recall_score(train_Y_5, y_train_pred_90)
from sklearn.metrics import roc_curve
# ROC curve: true-positive rate against false-positive rate.
fpr, tpr, thresholds = roc_curve(train_Y_5, scores)
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
# The diagonal is the performance of a purely random classifier.
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate', size=18)
ax.set_ylabel('True Positive Rate', size=18)
from sklearn.metrics import roc_auc_score
# Area under the ROC curve: 1.0 = perfect, 0.5 = random.
roc_auc_score(train_Y_5, scores)
from sklearn.ensemble import RandomForestClassifier
# Random forests expose predict_proba rather than decision_function;
# use the positive class's probability as the score.
probs_forest = cross_val_predict(RandomForestClassifier(random_state=42),
                                 train_X, train_Y_5, cv=3,
                                 method='predict_proba')
forest_scores = probs_forest[:, 1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(train_Y_5, forest_scores)
# Overlay the SGD and random-forest ROC curves plus the random baseline.
fig, ax = plt.subplots()
line_1, = ax.plot(fpr, tpr, 'r-')
line_2, = ax.plot(fpr_forest, tpr_forest, 'b-')
line_3, = ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate', size=18)
ax.set_ylabel('True Positive Rate', size=18)
legend = ax.legend((line_1, line_2, line_3), ('SGD', 'RFT', '--'),
                   loc='lower right', shadow=True, facecolor='0.9')
roc_auc_score(train_Y_5, forest_scores)
# Random forest on the full 10-class problem, scored with macro-averaged F1.
scores = cross_val_score(RandomForestClassifier(random_state=42),
                         train_X, train_Y, cv=3, scoring='f1_macro')
scores.mean(), scores.std()
# SVM classifier. SVMs are sensitive to feature scale, so standardize first.
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_X)
svm_clf = SVC()
svm_clf.fit(train_X_scaled, train_Y)
# Apply the *same* fitted scaler to test data before predicting.
svm_sample = scaler.transform(test_X[100].reshape(1, -1))
svm_clf.predict(svm_sample), test_Y[100]
# Out-of-fold multiclass predictions for error analysis.
train_pred = cross_val_predict(RandomForestClassifier(random_state=42),
                               train_X, train_Y, cv=3)
conf_mx = confusion_matrix(train_Y, train_pred)
# Normalize each row by that class's total count so the heatmap shows
# error *rates* rather than raw counts.
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums
import seaborn as sns
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(norm_conf_mx, vmax=0.1, center=0, fmt='.2f', square=True,
            linewidths=.5, annot=True, cbar_kws={"shrink": 0.7}, ax=ax)
ax.set_xlabel('Prediction', size=18)
ax.set_ylabel('Actual', size=18)
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
# Two independent binary labels per digit: "large" (>= 7) and "odd".
y_train_large = (train_Y >= 7)
y_train_odd = (train_Y % 2 == 1)
y_multilabel = np.c_[y_train_large, y_train_odd]
# KNN supports multilabel targets natively. Evaluate with out-of-fold
# predictions, then average the per-label F1 scores (macro average).
y_train_knn_pred = cross_val_predict(KNeighborsClassifier(), train_X,
                                     y_multilabel, cv=3, n_jobs=-1)
y_train_knn_pred
f1_score(y_multilabel, y_train_knn_pred, average="macro")
# Denoising experiment: add random pixel noise to the images and train a
# model to map a noisy image back to its clean original (one output per
# pixel).
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Derive the pixel count from the data instead of hard-coding 784, so this
# works for any flattened image size.
n_pixels = train_X.shape[1]
noise = np.random.randint(0, 100, (len(train_X), n_pixels))
X_train_mod = train_X + noise
noise = np.random.randint(0, 100, (len(test_X), n_pixels))
X_test_mod = test_X + noise
# Targets are the original clean images.
y_train_mod = train_X
y_test_mod = test_X
# Show the noisy input and the clean original for one test digit.
plt.imshow(X_test_mod[5500].reshape(28, 28), cmap=matplotlib.cm.binary,
           interpolation='nearest')
plt.imshow(test_X[5500].reshape(28, 28), cmap=matplotlib.cm.binary,
           interpolation='nearest')
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train_mod, y_train_mod)
# reshape(1, -1) builds an explicit single-sample batch instead of wrapping
# the row in a Python list (consistent with the rest of the file, and avoids
# sklearn's list-of-arrays input path).
clean_digit = knn_clf.predict(X_test_mod[5500].reshape(1, -1))
plt.imshow(clean_digit.reshape(28, 28), cmap=matplotlib.cm.binary,
           interpolation='nearest')