
Get the Data

In [1]:
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')
data_X, data_Y = mnist['data'], mnist['target']
Prepare Training Set and Test Set

In [2]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_Y, test_Y = train_test_split(data_X, data_Y, test_size=0.2, random_state=42, stratify = data_Y)
In [3]:
train_Y_5 = (train_Y == 5)
test_Y_5 = (test_Y == 5)

Binary Classifier

In [4]:
from sklearn.linear_model import SGDClassifier

sgd_cif = SGDClassifier(random_state=42), train_Y_5)
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)
In [12]:
sgd_cif.predict(test_X[100].reshape(1, -1)), test_Y_5[100]
(array([False]), False)
In [18]:
# decision_function() returns a score for each instance
sgd_cif.decision_function(test_X[100].reshape(1, -1))


In [19]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(sgd_cif, train_X, train_Y_5, cv = 3, scoring="accuracy") # same as stratified K-fold validation
array([0.96710773, 0.96925055, 0.96255223])

Confusion Matrix

In [20]:
from sklearn.model_selection import cross_val_predict
train_pred = cross_val_predict(sgd_cif, train_X, train_Y_5, cv = 3)
# cross_val_predict get a clean prediction for each instance in the training set with K-fold cross-validation
In [21]:
from sklearn.metrics import confusion_matrix
confusion_matrix(train_Y_5, train_pred) # actual labels, predicted labels
# Each row represents an actual class
# Each column represents a predicted class
array([[50445,   504],
       [ 1383,  3668]])

Precision: the accuracy of the positive predictions

$$precison = \dfrac{TP}{TP+FP}$$

Recall: sensitivey or the true positive rate (TPR)

$$recall = \dfrac{TP}{TP+FN}$$


$$accuracy = \dfrac{TP+TN}{TP+TN+FP+FN}$$

F1: harmonic mean

$$f_{1} = \dfrac{2TP}{2TP+FP+FN}$$
In [22]:
from sklearn.metrics import precision_score, recall_score, f1_score

# precision
precision_score(train_Y_5, train_pred)
# recall
recall_score(train_Y_5, train_pred)
# f1
f1_score(train_Y_5, train_pred)

Precision/Recall Tradeoff

  • Model computes a score based on a decison function, and if the score is greater than a threshold, it assigns the instance to the positive class, or else it assigns it to the negative class
  • The precision may sometimes go down when you raise the threshold
In [24]:
scores = cross_val_predict(sgd_cif, train_X, train_Y_5, cv = 3, method="decision_function")
# decision score for each instance
In [28]:
from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(train_Y_5, scores)
In [30]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(); # create a figure and a subplot

# Plot two lines
line1, = ax.plot(thresholds, precisions[:-1], 'b--');
line2, = ax.plot(thresholds, recalls[:-1], 'g-');
ax.set_xlabel('Threshold', size=18);
ax.set_ylabel('Probability', size = 18);

legend = ax.legend((line1, line2), ('Precsion', 'Recall'), loc='upper left', shadow=True, facecolor='0.9', bbox_to_anchor=(0.01, 1));
# precision may sometimes go down when raising the threshold
In [31]:
fig, ax = plt.subplots();

ax.plot(recalls, precisions)

ax.set_xlabel('Recall', size=18);
ax.set_ylabel('Probability', size = 18);
In [46]:
# Predict with a specific threshold
import numpy as np
threshold_90_precision = thresholds[np.argmax(precisions >= 0.90)]

y_train_pred_90 = (scores >= threshold_90_precision)

precision_score(train_Y_5, y_train_pred_90), recall_score(train_Y_5, y_train_pred_90)
(0.9000519210799585, 0.6863987329241734)

The Receiver Operating Characteristic (ROC) Curve

  • True Positive Rate (TPR) v.s. False Positive Rate (FPR), FPR = 1 - TNR, TNR is called specificity, so ROC curve plots sensitivity (recall) v.s. 1-specificity
  • A good classifier says as far away from the straight line
  • Comparing classifiers by measuring the area under the curve (AUC), a perfect classifer will have a ROC AUC equal to 1, whereas a purely random classifier will have a ROC AUC equal to 0.5
In [47]:
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(train_Y_5, scores)
In [48]:
fig, ax = plt.subplots();

ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1], 'k--')

ax.set_xlabel('False Positive Rate', size=18);
ax.set_ylabel('True Positive Rate', size = 18);
In [50]:
from sklearn.metrics import roc_auc_score

roc_auc_score(train_Y_5, scores)
In [51]:
from sklearn.ensemble import RandomForestClassifier

probs_forest = cross_val_predict(RandomForestClassifier(random_state=42), train_X, train_Y_5, cv=3, method='predict_proba')
In [52]:
# use the positive class's probability as the score
fpr_forest, tpr_forest, thresholds_forest = roc_curve(train_Y_5, probs_forest[:, 1])
In [53]:
fig, ax = plt.subplots();

line_1, = ax.plot(fpr, tpr, 'r-')
line_2, = ax.plot(fpr_forest, tpr_forest, 'b-')
line_3, = ax.plot([0, 1], [0, 1], 'k--')

ax.set_xlabel('False Positive Rate', size=18);
ax.set_ylabel('True Positive Rate', size = 18);
legend = ax.legend((line_1, line_2, line_3), ('SGD', 'RFT', '--'), loc='lower right', shadow=True, facecolor='0.9');
In [54]:
roc_auc_score(train_Y_5, probs_forest[:, 1])

PR curve or ROC curve

  • PR curve, the positive class is rare or care more FP than FN
  • ROC curve, otherwise

Multiclass Classification

  • Algorithm can handle multiple class directly, such as SGD classifier, Random Forest classifiers or naive Bayes classifiers
  • Algorithm are strictly binary classfiers, such as Logistic Regression, or Support Vector Machine classifiers
    • one-versus-the-rest (OvR) or one-versus-all (OvA), default, N categories need N classifiers
    • one-versus-one (OvO), SVM, N categories need N*(N-1)/2 classifiers
    • for most binary classification algorithms, OvR is prefered
In [108]:
# Random Forest Classifer
scores = cross_val_score(RandomForestClassifier(random_state=42), train_X, train_Y, cv=3, scoring='f1_macro')
In [111]:
scores.mean(), scores.std()
(0.9407508281367417, 0.001924204491467725)
In [7]:
# SVM Classifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_X)

svm_clf = SVC();, train_Y)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)
In [11]:
svm_clf.predict(scaler.transform(test_X[100].reshape(1, -1))), test_Y[100]
(array([1.]), 1.0)

Error Analysis

In [17]:
train_pred = cross_val_predict(RandomForestClassifier(random_state=42), train_X, train_Y, cv=3)
In [23]:
conf_mx = confusion_matrix(train_Y, train_pred)
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx/row_sums

import seaborn as sns
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(norm_conf_mx, vmax=0.1, center=0, fmt='.2f',
                square=True, linewidths=.5, annot=True, cbar_kws={"shrink": 0.7}, ax = ax)
ax.set_xlabel('Prediction', size=18);
ax.set_ylabel('Actual', size = 18);

Multilabel Classification

  • Outputs multiple binary labels
In [24]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

y_train_large = (train_Y >= 7)
y_train_odd = (train_Y % 2 == 1)
y_multilabel = np.c_[y_train_large, y_train_odd] # create multiple labels
In [25]:
# Measure the F1 score for each individual label, then simply compute the average score
y_train_knn_pred = cross_val_predict(KNeighborsClassifier(), train_X, y_multilabel, cv=3, n_jobs=-1)
In [29]:
f1_score(y_multilabel, y_train_knn_pred, average="macro")

Multioutput Classification

In [142]:
noise = np.random.randint(0, 100, (len(train_X), 784))
X_train_mod = train_X + noise
noise = np.random.randint(0, 100, (len(test_X), 784))
X_test_mod = test_X + noise
y_train_mod = train_X
y_test_mod = test_X
In [143]:
import matplotlib
import matplotlib.pyplot as plt
plt.imshow(X_test_mod[5500].reshape(28, 28), cmap =, interpolation='nearest')
<matplotlib.image.AxesImage at 0x1a27765748>
In [144]:
plt.imshow(test_X[5500].reshape(28, 28), cmap =, interpolation='nearest')
<matplotlib.image.AxesImage at 0x1a277ca240>
In [146]:
knn_clf = KNeighborsClassifier(), y_train_mod)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
In [148]:
clean_digit = knn_clf.predict([X_test_mod[5500]])
plt.imshow(clean_digit.reshape(28, 28), cmap =, interpolation='nearest')
<matplotlib.image.AxesImage at 0x1a279ccda0>