Curves

In [25]:
from sklearn.datasets import fetch_openml

# MNIST: 70,000 digit images flattened to 784 pixel features; targets are the strings '0'-'9'
mnist = fetch_openml('mnist_784')
data_X, data_Y = mnist['data'], mnist['target']
In [29]:
import numpy as np

# binary target: is the digit a 5?
train_X = data_X
train_Y_5 = (data_Y == '5')
In [ ]:
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
# decision-function scores (rather than hard predictions) are needed to sweep a threshold below
scores = cross_val_predict(model, train_X, train_Y_5, cv=3, method="decision_function")

Precision-Recall Curve

  • Binary classification
  • Steps
    • the classifier computes a score for each instance, based on its decision function or predicted probability
    • if the score is greater than a threshold, assign the instance to the positive class; otherwise assign it to the negative class (see the sketch below)
  • Select a precision/recall trade-off just before the sharp drop in precision
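A minimal sketch of the thresholding step, reusing the scores from the cross_val_predict cell above; the default cut-off for a decision function is 0, and the stricter value 2 is an arbitrary choice for illustration:
In [ ]:
from sklearn.metrics import precision_score, recall_score

# classify by comparing each decision-function score to a threshold
for threshold in (0, 2):
    pred = (scores > threshold)
    print(threshold,
          precision_score(train_Y_5, pred),   # precision usually rises with the threshold
          recall_score(train_Y_5, pred))      # recall can only fall as the threshold rises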
In [75]:
from sklearn.metrics import precision_recall_curve

# precision and recall at every candidate threshold (thresholds has one fewer entry)
precisions, recalls, thresholds = precision_recall_curve(train_Y_5, scores)
In [83]:
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)

line1, = ax.plot(thresholds, precisions[:-1], 'b--', label='Precision')
line2, = ax.plot(thresholds, recalls[:-1], 'g-', label='Recall')

legend = ax.legend((line1, line2), ('Precision', 'Recall'), shadow=True);
plt.xlabel("Threshold", fontsize=14)
Out[83]:
Text(0.5, 0, 'Threshold')
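One way to pick an operating point from these arrays is to look up the lowest threshold that reaches a target precision; the 90% target below is an arbitrary illustrative choice:
In [ ]:
# first index at which precision reaches the (arbitrary) 90% target
idx = np.argmax(precisions[:-1] >= 0.90)
threshold_90 = thresholds[idx]
print(threshold_90, precisions[idx], recalls[idx])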
In [84]:
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)

ax.plot(recalls, precisions)
plt.xlabel("Recall", fontsize=14)
plt.ylabel("Precision", fontsize=14)
ax.set_xlim(0, 1);

Receiver Operating Characteristic (ROC)

  • Binary classification
  • Plots the true positive rate (recall) against the false positive rate (FPR), i.e. sensitivity (recall) versus 1 - specificity
    • $FPR = 1 - TNR$, where TNR is the true negative rate, also called specificity (see the sanity-check sketch after this list)
  • The higher the recall (TPR), the more false positives (FPR) the classifier produces
  • Choose between classifiers by comparing the area under the curve (AUC)
  • Prefer the precision-recall curve whenever the positive class is rare or when you care more about false positives than false negatives; otherwise, use the ROC curve
  • ROC curves are appropriate when the observations are balanced between the classes, whereas precision-recall curves are appropriate for imbalanced datasets
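As a quick sanity check on the definitions above, the FPR at the default decision-function cut-off (score > 0) can be computed from the confusion matrix and compared with 1 - TNR; this sketch simply reuses the scores from earlier:
In [ ]:
from sklearn.metrics import confusion_matrix

pred = (scores > 0)                        # default decision-function cut-off
tn, fp, fn, tp = confusion_matrix(train_Y_5, pred).ravel()
fpr_at_0 = fp / (fp + tn)                  # false positive rate
tnr_at_0 = tn / (tn + fp)                  # true negative rate (specificity)
print(fpr_at_0, 1 - tnr_at_0)              # the two values should agree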
In [89]:
from sklearn.metrics import roc_curve, roc_auc_score

# FPR and TPR at every threshold, plus the area under the ROC curve
fpr, tpr, thresholds = roc_curve(train_Y_5, scores)
roc_auc_score(train_Y_5, scores)
Out[89]:
0.9750614659975092
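Since the 5s make up only about a tenth of the dataset, it can also be worth reporting the area under the precision-recall curve (average precision), which is usually lower than the ROC AUC on imbalanced data; this comparison is optional:
In [ ]:
from sklearn.metrics import average_precision_score

average_precision_score(train_Y_5, scores)   # area under the precision-recall curve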
In [88]:
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)

ax.plot(fpr, tpr)
plt.xlabel("False Positive Rate", fontsize=14)
plt.ylabel("True Positive Rate (Recall)", fontsize=14)
ax.set_xlim(0, 1);
ax.set_ylim(0, 1);

Learning Curve

  • Regression, binary classification, multiclass classification
  • Plot the model's performance on the training set and on the validation set as a function of the training set size
  • Steps (written out by hand in the sketch after the model definition below)
    • train the model on a fraction of the training set
    • evaluate the trained model on that same fraction of the training set
    • evaluate the trained model on the validation set
    • repeat with different sizes of the training fraction
  • Underfitting
    • both curves have reached a plateau, they are close together, and the error is fairly high (i.e. accuracy is fairly low)
  • Overfitting
    • the error on the training data is low (training accuracy is high)
    • there is a sizeable gap between the two curves
In [102]:
from sklearn.datasets import load_digits
# the small 8x8 digits dataset: 1,797 samples, 64 pixel features
X, y = load_digits(return_X_y=True)
In [103]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
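The steps listed above can also be written out by hand; this is only a sketch, using a single 80/20 split and accuracy, of what learning_curve (used in the next cell) automates with cross-validation:
In [ ]:
from sklearn.base import clone
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

for frac in (0.1, 0.325, 0.55, 0.775, 1.0):
    n = int(frac * len(X_tr))
    m = clone(model).fit(X_tr[:n], y_tr[:n])   # train on a fraction of the training set
    print(n,
          m.score(X_tr[:n], y_tr[:n]),         # accuracy on that same fraction
          m.score(X_val, y_val))               # accuracy on the held-out validation set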
In [121]:
from sklearn.model_selection import learning_curve
# train/validation accuracy over 5 cross-validation folds at 5 training-set sizes
train_sizes, train_scores, test_scores = learning_curve(model, X, y, cv=5, train_sizes=np.linspace(.1, 1.0, 5), scoring='accuracy')
In [122]:
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
In [125]:
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111)

line1, = ax.plot(train_sizes, train_scores_mean, 'ro-', label='Train')
line2, = ax.plot(train_sizes, test_scores_mean, 'go-', label='Validation')

ax.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
ax.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")

legend = ax.legend((line1, line2), ('Train', 'Validation'), shadow=True);
plt.xlabel("Training set size", fontsize=14)
plt.ylabel("Accuracy", fontsize=14)
Out[125]:
Text(0, 0.5, 'Accuracy')