Metrics

In [2]:
import numpy as np
import pandas as pd

Binary-Class

  • By default, the positive class is labelled 1.
In [5]:
y = np.loadtxt('temp_3.txt')
y_pred = np.loadtxt('temp_4.txt')
data = np.hstack((y.reshape(-1, 1), y_pred.reshape(-1, 1)))
In [6]:
from sklearn.metrics import confusion_matrix
cf_matrix = confusion_matrix(data[:, 0], data[:, 1]) # True, Prediction
In [7]:
# each row corresponding to the true class
# each column corresponding to the prediction class
pd.DataFrame(cf_matrix, index = ['False', 'True'], columns = ['False', 'True'])
Out[7]:
       False  True
False    619    93
True     237   243
In [8]:
data[np.logical_and(data[:, 0] == 0, data[:, 1] == 0)].shape
Out[8]:
(619, 2)

TP = 243, FP = 93, FN = 237, TN = 619

  • Precision, 243 / (243+93) = 0.723214
  • Recall, 243 / (243+237) = 0.50625
  • F1-score, 2 * 0.72 * 0.51 / (0.72 + 0.51) = 0.596
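
These counts and scores can be read straight off cf_matrix; a minimal sketch using the cf_matrix computed above (for binary 0/1 labels, ravel() returns the counts in the order tn, fp, fn, tp):

# unpack counts from the binary confusion matrix
tn, fp, fn, tp = cf_matrix.ravel()        # 619, 93, 237, 243

precision = tp / (tp + fp)                # 243 / 336
recall    = tp / (tp + fn)                # 243 / 480
f1        = 2 * precision * recall / (precision + recall)
print(precision, recall, f1)              # ~0.7232, 0.50625, ~0.596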

Accuracy

In [9]:
from sklearn.metrics import accuracy_score

print(accuracy_score(data[:, 0], data[:, 1])) # fraction of correct predictions
print(accuracy_score(data[:, 0], data[:, 1], normalize=False)) # number of correct predictions
0.7231543624161074
862
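
As a sanity check, the same numbers follow directly from the data array with NumPy; a minimal sketch:

# accuracy is the fraction (or count) of positions where prediction equals truth
print((data[:, 0] == data[:, 1]).mean())   # 0.7231543624161074
print((data[:, 0] == data[:, 1]).sum())    # 862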

Precision

In [87]:
from sklearn.metrics import precision_score

# None, return the scores for each class
print(precision_score(data[:, 0], data[:, 1], average=None)) 

# Only report results for the class specified by pos_label, the default pos_label = 1
print(precision_score(data[:, 0], data[:, 1], average='binary')) 
[0.72313084 0.72321429]
0.7232142857142857
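
If the class of interest is labelled 0 rather than 1, pos_label can be changed; a minimal sketch:

# score class 0 as the positive class instead of the default pos_label=1
print(precision_score(data[:, 0], data[:, 1], average='binary', pos_label=0))  # 619 / (619 + 237) ~ 0.7231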

Recall

In [88]:
from sklearn.metrics import recall_score

# None, return the scores for each class
print(recall_score(data[:, 0], data[:, 1], average=None)) 

# Only report results for the class specified by pos_label, the default pos_label = 1
print(recall_score(data[:, 0], data[:, 1], average='binary'))
[0.86938202 0.50625   ]
0.50625
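
The per-class recalls are simply the diagonal of the confusion matrix divided by the row sums (the true-class supports); a quick check:

# row sums of cf_matrix are the supports of the true classes
print(cf_matrix.diagonal() / cf_matrix.sum(axis=1))  # [0.86938202 0.50625   ]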

F1-score

In [90]:
from sklearn.metrics import f1_score

# None, return the scores for each class
print(f1_score(data[:, 0], data[:, 1], average=None)) 

# Only report results for the class specified by pos_label, the default pos_label = 1
print(f1_score(data[:, 0], data[:, 1], average='binary'))
[0.78954082 0.59558824]
0.5955882352941176
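
The per-class F1 scores can also be recovered from the per-class precision and recall arrays; a minimal sketch:

from sklearn.metrics import precision_score, recall_score

p = precision_score(data[:, 0], data[:, 1], average=None)
r = recall_score(data[:, 0], data[:, 1], average=None)
print(2 * p * r / (p + r))  # [0.78954082 0.59558824]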

Multi-Class

In [34]:
y = np.loadtxt('temp_1.txt')
y_pred = np.loadtxt('temp_2.txt')
In [47]:
data = np.hstack((y.reshape(-1, 1), y_pred.reshape(-1, 1)))
In [49]:
data # Column 0, True; Column 1, Prediction
Out[49]:
array([[2., 2.],
       [1., 1.],
       [2., 2.],
       ...,
       [2., 0.],
       [2., 2.],
       [2., 2.]])
In [50]:
from sklearn.metrics import confusion_matrix
cf_matrix = confusion_matrix(data[:, 0], data[:, 1]) # True, Prediction
In [64]:
# each row corresponding to the true class
# each column corresponding to the prediction class
pd.DataFrame(cf_matrix, index = ['0', '1', '2'], columns = ['0', '1', '2'])
Out[64]:
     0   1    2
0  277  16  187
1   28  22  121
2  131  26  384
In [63]:
data[np.logical_and(data[:, 0] == 2, data[:, 1] == 1)].shape
Out[63]:
(26, 2)

Class 0

  • TP = 277, FP = 28 + 131, FN = 16 + 187, TN = 22 + 121 + 26 + 384
  • Precision = 0.64, Recall = 0.58, F1-score = 0.60

Class 1

  • TP = 22, FP = 16 + 26, FN = 28 + 121, TN = 277 + 187 + 131 + 384
  • Precision = 0.34, Recall = 0.13, F1-score = 0.19

Class 2

  • TP = 384, FP = 187 + 121, FN = 131 + 26, TN = 277 + 16 + 28 + 22
  • Precision = 0.55, Recall = 0.71, F1-score = 0.62
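
The same bookkeeping can be done for all classes at once from cf_matrix; a minimal sketch:

# per-class counts from the multi-class confusion matrix
tp = cf_matrix.diagonal()          # [277  22 384]
fp = cf_matrix.sum(axis=0) - tp    # column sums minus diagonal -> [159  42 308]
fn = cf_matrix.sum(axis=1) - tp    # row sums minus diagonal    -> [203 149 157]

precision = tp / (tp + fp)         # [0.64 0.34 0.55]
recall    = tp / (tp + fn)         # [0.58 0.13 0.71]
f1        = 2 * precision * recall / (precision + recall)
print(precision, recall, f1)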
In [54]:
from sklearn.metrics import classification_report
print(classification_report(data[:, 0], data[:, 1], target_names=['0', '1', '2']))
              precision    recall  f1-score   support

           0       0.64      0.58      0.60       480
           1       0.34      0.13      0.19       171
           2       0.55      0.71      0.62       541

    accuracy                           0.57      1192
   macro avg       0.51      0.47      0.47      1192
weighted avg       0.56      0.57      0.55      1192
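
The same per-class numbers can be obtained as arrays with precision_recall_fscore_support; a minimal sketch:

from sklearn.metrics import precision_recall_fscore_support

# returns per-class precision, recall, f1 and support in one call
p, r, f, s = precision_recall_fscore_support(data[:, 0], data[:, 1])
print(p)  # [0.6353211  0.34375    0.55491329]
print(r)  # [0.57708333 0.12865497 0.70979667]
print(f)  # [0.60480349 0.18723404 0.62287105]
print(s)  # [480 171 541]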

Accuracy

$$accuracy(y, \hat{y}) = \frac{1}{n} \sum_{i=0}^{n-1} 1(\hat{y}_i = y_i)$$
In [58]:
from sklearn.metrics import accuracy_score

print(accuracy_score(data[:, 0], data[:, 1])) # fraction of correct predictions
print(accuracy_score(data[:, 0], data[:, 1], normalize=False)) # number of correct predictions
0.572986577181208
683
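
Equivalently, accuracy is the trace of the confusion matrix divided by the total number of samples; a quick check:

print(cf_matrix.trace() / cf_matrix.sum())  # 683 / 1192 = 0.572986...
print(cf_matrix.trace())                    # 683 correct predictions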

Precision

$$precision = \frac{tp}{tp+fp}$$
In [76]:
from sklearn.metrics import precision_score

# None, return the scores for each class
print(precision_score(data[:, 0], data[:, 1], average=None)) 

# simply calculates the mean of the binary metrics, giving equal weight to each class
print(precision_score(data[:, 0], data[:, 1], average='macro')) # (0.64 + 0.34 + 0.55) / 3

# sums the dividends and divisors that make up the per-class metrics to calculate an overall quotient
print(precision_score(data[:, 0], data[:, 1], average='micro')) # (277+22+384)/(277+22+384+159+42+308)

# each class’s score is weighted by its presence in the true data sample
print(precision_score(data[:, 0], data[:, 1], average='weighted')) # 0.64*(480/1192) + 0.34*(171/1192) + 0.55*(541/1192)
[0.6353211  0.34375    0.55491329]
0.5113281319050397
0.572986577181208
0.5569995561459027
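
These averaging modes can be reproduced by hand from the per-class scores and supports; a minimal sketch:

per_class = precision_score(data[:, 0], data[:, 1], average=None)
support = np.bincount(data[:, 0].astype(int))       # [480 171 541]

print(per_class.mean())                             # macro: 0.5113...
print(np.average(per_class, weights=support))       # weighted: 0.5570...
print(cf_matrix.trace() / cf_matrix.sum())          # micro: equals accuracy here, 0.5730...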

Recall

$$recall = \frac{tp}{tp+fn}$$
In [77]:
from sklearn.metrics import recall_score

# None, return the scores for each class
print(recall_score(data[:, 0], data[:, 1], average=None)) 

# simply calculates the mean of the binary metrics, giving equal weight to each class
print(recall_score(data[:, 0], data[:, 1], average='macro'))

# sums the dividends and divisors that make up the per-class metrics to calculate an overall quotient
print(recall_score(data[:, 0], data[:, 1], average='micro'))

# each class’s score is weighted by its presence in the true data sample
print(recall_score(data[:, 0], data[:, 1], average='weighted'))
[0.57708333 0.12865497 0.70979667]
0.47184499230722105
0.572986577181208
0.572986577181208
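
Note that micro-averaged recall equals plain accuracy (every sample contributes exactly one true label), and macro-averaged recall is what sklearn reports as balanced accuracy; a minimal sketch:

from sklearn.metrics import balanced_accuracy_score

# balanced accuracy is the unweighted mean of the per-class recalls (macro recall)
print(balanced_accuracy_score(data[:, 0], data[:, 1]))  # 0.47184...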

F-measure

$$\text{F-measure} = \frac{2 \times precision \times recall}{precision + recall}$$
In [79]:
from sklearn.metrics import f1_score

# None, return the scores for each class
print(f1_score(data[:, 0], data[:, 1], average=None)) 

# simply calculates the mean of the binary metrics, giving equal weight to each class
print(f1_score(data[:, 0], data[:, 1], average='macro'))

# sums the dividends and divisors that make up the per-class metrics to calculate an overall quotient
print(f1_score(data[:, 0], data[:, 1], average='micro'))

# each class’s score is weighted by its presence in the true data sample
print(f1_score(data[:, 0], data[:, 1], average='weighted'))
[0.60480349 0.18723404 0.62287105]
0.4716361940772278
0.572986577181208
0.553100615891127
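
f1_score is the beta=1 special case of fbeta_score, which weights recall beta times as much as precision; a minimal sketch:

from sklearn.metrics import fbeta_score

# beta=1 reproduces f1_score; beta=2 weights recall more heavily than precision
print(fbeta_score(data[:, 0], data[:, 1], beta=1, average='macro'))  # same as the macro F1 above
print(fbeta_score(data[:, 0], data[:, 1], beta=2, average=None))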
