Semi-supervised Learning (PU Learning)
¶

In [ ]:
import pandas as pd
import numpy as np
# UCI "banknote authentication" data set: a headerless CSV, so pandas assigns
# integer column names.
url = (
    "http://archive.ics.uci.edu/ml/machine-learning-databases/00267/"
    "data_banknote_authentication.txt"
)
data = pd.read_csv(filepath_or_buffer=url, header=None)

1. Create a data set¶

  • it contains a small set of labeled positive records, marked as 1
  • the remaining records are treated as unlabeled and marked as -1
In [ ]:
# Seed NumPy's global generator so the randomly chosen labeled subset is the same
# on every Restart & Run All. fit_PU_estimator also shuffles with np.random, so
# its hold-out splits become repeatable as well.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

mod_data = data.copy()
# positions of the true positive samples (the last column holds the original class)
pos_ind = np.where(mod_data.iloc[:, -1].values == 1)[0]
# shuffle them so the labeled subset is a random sample of the positives
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

# every record starts out unlabeled (-1) ...
mod_data['class_test'] = -1
# ... then the sampled positives are marked as labeled (1). pos_sample holds row
# *positions*, so assign positionally; label-based .loc would only be correct
# while the frame keeps its default 0..n-1 index.
mod_data.iloc[pos_sample, mod_data.columns.get_loc('class_test')] = 1
In [ ]:
# Sanity check: number of records per PU label (1 = labeled positive, -1 = unlabeled).
mod_data['class_test'].value_counts()
In [ ]:
# Original ground-truth class (second-to-last column); later cells use it for evaluation.
y_positive = mod_data.iloc[:, -2].to_numpy()
# PU label (last column): 1 = labeled positive, -1 = unlabeled.
y_labeled = mod_data.iloc[:, -1].to_numpy()
# Feature matrix: every column except the two label columns.
x_data = mod_data.iloc[:, :-2].to_numpy()

2. Define training function¶

In [ ]:
def fit_PU_estimator(X, y, hold_out_ratio, estimator, random_state=None):
    """Fit a labeled-vs-unlabeled classifier and estimate c = P(s=1|y=1).

    A random share of the labeled samples (``y == 1``) is held out, the
    estimator is trained on everything else to model P(s=1|x), and the mean
    predicted probability over the held-out labeled samples is returned as
    the estimate of P(s=1|y=1).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : np.ndarray of shape (n_samples,)
        PU labels; entries equal to 1 are labeled, every other value is
        treated as unlabeled.
    hold_out_ratio : float
        Fraction of the labeled samples to hold out (rounded up).
    estimator : object
        Classifier exposing ``fit(X, y)`` and ``predict_proba(X)``.
    random_state : int, optional
        Seed for the hold-out shuffle. When ``None`` (default) NumPy's global
        random state is used.

    Returns
    -------
    estimator : the fitted estimator (the same object that was passed in).
    c : float
        Mean predicted probability of "labeled" over the held-out samples.

    Raises
    ------
    AssertionError
        If ``y`` is not a NumPy array.
    ValueError
        If ``y`` has no labeled samples or the hold-out set would be empty.
    """
    assert isinstance(y, np.ndarray), "Must pass np.ndarray rather than list as y"
    X = np.asarray(X)

    # indices of the positive/labeled elements
    positives = np.where(y == 1.)[0]
    if positives.size == 0:
        raise ValueError("y contains no samples labeled 1; cannot estimate P(s=1|y=1)")

    # number of labeled samples kept aside to estimate P(s=1|y=1)
    hold_out_size = int(np.ceil(len(positives) * hold_out_ratio))
    if hold_out_size < 1:
        # an empty hold-out would make c the mean of nothing (NaN)
        raise ValueError("hold_out_ratio must be > 0 so at least one labeled sample is held out")

    if random_state is None:
        np.random.shuffle(positives)
    else:
        np.random.RandomState(random_state).shuffle(positives)

    # indices, then the actual rows, of the held-out labeled samples
    hold_out = positives[:hold_out_size]
    X_hold_out = X[hold_out]

    # remove the held-out elements from the training data
    X_train = np.delete(X, hold_out, 0)
    y_train = np.delete(y, hold_out)

    # Encode the target as 0 (unlabeled) / 1 (labeled). Some estimators only
    # accept class labels 0..n_classes-1, and with this encoding column 1 of
    # predict_proba is the "labeled" class.
    s_train = (y_train == 1).astype(int)

    # Fit on the unlabeled samples + the remaining labeled ones to model P(s=1|x).
    estimator.fit(X_train, s_train)

    # Probability of "labeled" for each held-out sample; all of them are truly
    # labeled, so the mean estimates P(s=1|y=1).
    hold_out_predictions = estimator.predict_proba(X_hold_out)[:, 1]
    c = np.mean(hold_out_predictions)
    return estimator, c
In [ ]:
def predict_PU_prob(X, estimator, prob_s1y1):
    """Return the estimator's class-1 probabilities for X divided by prob_s1y1.

    ``estimator`` must expose ``predict_proba``; column 1 of its output is
    taken as the probability that a sample is labeled, and dividing it by
    ``prob_s1y1`` (the estimate of P(s=1|y=1)) gives the PU score per row.
    The result is not clipped, so values above 1 are possible.
    """
    labeled_proba = estimator.predict_proba(X)[:, 1]
    return labeled_proba / prob_s1y1
In [ ]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score
def evaluate_results(y_test, y_predict):
    """Print F1, ROC AUC, recall and precision (as percentages) for binary predictions.

    ``y_test`` holds the true labels and ``y_predict`` the predicted labels.
    Each metric is computed and printed in turn; nothing is returned.
    """
    # (output template, metric thunk) pairs, evaluated lazily so each score is
    # computed immediately before it is printed.
    report_lines = (
        ("f1: %.2f%%", lambda: f1_score(y_test, y_predict)),
        ("roc: %.2f%%", lambda: roc_auc_score(y_test, y_predict)),
        ("recall: %.2f%%", lambda: recall_score(y_test, y_predict, average='binary')),
        ("precision: %.2f%%", lambda: precision_score(y_test, y_predict, average='binary')),
    )

    print('Classification results:')
    for template, compute_score in report_lines:
        print(template % (compute_score() * 100.0))

3. Predict the probability of being positive for every record in the created data set¶

In [ ]:
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
In [ ]:
# Repeat the PU fit several times, each with a fresh random hold-out of labeled
# positives, and accumulate the rescaled probabilities so they can be averaged.
learning_iterations = 24
predicted = np.zeros(len(x_data))

for index in range(learning_iterations):
    # RandomForestClassifier() can be swapped in here as an alternative base estimator.
    pu_estimator, probs1y1 = fit_PU_estimator(
        x_data, y_labeled, hold_out_ratio=0.2, estimator=xgb.XGBClassifier()
    )
    predicted += predict_PU_prob(x_data, pu_estimator, prob_s1y1=probs1y1)

    # Report progress on every fourth iteration only.
    if index % 4 != 0:
        continue
    print(f'Learning Iteration::{index}/{learning_iterations} => P(s=1|y=1)={round(probs1y1,2)}')
In [ ]:
# Average the accumulated PU scores over all iterations and mark a record as
# positive (1) when its mean score exceeds 0.1, otherwise 0.
y_predict = np.where(predicted / learning_iterations > 0.1, 1, 0).tolist()

4. Evaluate performance¶

In [165]:
# classification_report is not among the sklearn.metrics names imported earlier
# in this notebook, so import it here; otherwise this cell raises NameError on a
# fresh kernel.
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the PU predictions against the original labels.
print(classification_report(y_positive, y_predict, target_names=['class 0', 'class 1']))
              precision    recall  f1-score   support

     class 0       0.93      1.00      0.96       762
     class 1       0.99      0.91      0.95       610

    accuracy                           0.96      1372
   macro avg       0.96      0.95      0.96      1372
weighted avg       0.96      0.96      0.96      1372

In [166]:
# Print F1, ROC AUC, recall and precision of the PU predictions against the original class labels.
evaluate_results(y_positive, y_predict)
Classification results:
f1: 95.03%
roc: 95.29%
recall: 90.98%
precision: 99.46%

Reference¶

  • Semi-Supervised Classification of Unlabeled Data (PU Learning)
  • Notebook