import pandas as pd
import numpy as np
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt"
data = pd.read_csv(url, header=None)
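# The raw file has no header row. Naming the columns makes inspection easier;
# the names below are ours, adapted from the UCI dataset description, and the
# line is optional since all later access is positional via iloc:
data.columns = ['variance', 'skewness', 'kurtosis', 'entropy', 'class']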
mod_data = data.copy()
# get the indices of the positive samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
mod_data['class_test'].value_counts()
x_data = mod_data.iloc[:,:-2].values    # feature matrix X only
y_labeled = mod_data.iloc[:,-1].values  # PU labels: 1 = labeled positive, -1 = unlabeled
y_positive = mod_data.iloc[:,-2].values # original (fully known) class, kept for evaluation
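# A quick sanity check on the split (our addition): the feature matrix and the
# two label vectors must stay aligned row-for-row.
assert x_data.shape[0] == y_labeled.shape[0] == y_positive.shape[0]
print(f'x_data: {x_data.shape}, labeled positives: {(y_labeled == 1).sum()}, '
      f'true positives: {(y_positive == 1).sum()}')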
def fit_PU_estimator(X, y, hold_out_ratio, estimator):
    assert isinstance(y, np.ndarray), "Must pass np.ndarray rather than list as y"
    # find the indices of the positive/labeled elements
    positives = np.where(y == 1.)[0]
    # hold_out_size = the *number* of positive/labeled samples
    # that we will use later to estimate P(s=1|y=1)
    hold_out_size = int(np.ceil(len(positives) * hold_out_ratio))
    np.random.shuffle(positives)
    # hold_out = the *indices* of the positive elements
    # that we will later use to estimate P(s=1|y=1)
    hold_out = positives[:hold_out_size]
    # the actual positive *elements* that we will keep aside
    X_hold_out = X[hold_out]
    # remove the held-out elements from X and y
    X = np.delete(X, hold_out, 0)
    y = np.delete(y, hold_out)
    # Fit the estimator on the unlabeled samples plus the remaining labeled
    # positives, in order to estimate P(s=1|X): the probability that an
    # element is *labeled*. Newer XGBoost versions require class labels in
    # {0, 1}, so map the unlabeled marker -1 to 0 before fitting
    # (predict_proba[:, 1] still corresponds to the labeled class either way).
    estimator.fit(X, (y == 1).astype(int))
    # We then use the estimator to score the positive held-out set
    # in order to estimate P(s=1|y=1)
    hold_out_predictions = estimator.predict_proba(X_hold_out)
    # take the probability that the sample is labeled (column of class 1)
    hold_out_predictions = hold_out_predictions[:, 1]
    # save the mean probability as c = P(s=1|y=1)
    c = np.mean(hold_out_predictions)
    return estimator, c
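# Why dividing by c = P(s=1|y=1) works (Elkan & Noto, 2008): under the
# "selected completely at random" assumption, a positive sample is labeled
# with the same probability c regardless of its features, so
#     P(s=1|x) = P(y=1|x) * P(s=1|y=1) = P(y=1|x) * c
# and therefore
#     P(y=1|x) = P(s=1|x) / c
# Averaging the classifier's scores over held-out positives estimates c,
# because for a sample known to be positive, P(s=1|x) = c.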
def predict_PU_prob(X, estimator, prob_s1y1):
    # correct the "labeled" probability into a class probability:
    # P(y=1|x) = P(s=1|x) / P(s=1|y=1)
    predicted_s = estimator.predict_proba(X)
    predicted_s = predicted_s[:, 1]
    return predicted_s / prob_s1y1
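# Because c is estimated with noise, predicted_s / c can exceed 1 for samples
# the classifier scores confidently. A minimal sketch of a clipped variant
# (the clipping is our addition, not part of the recipe above):
def predict_PU_prob_clipped(X, estimator, prob_s1y1):
    predicted_s = estimator.predict_proba(X)[:, 1]
    return np.clip(predicted_s / prob_s1y1, 0.0, 1.0)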
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score, classification_report

def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0))
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0))
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0))
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0))
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
# average the corrected probabilities over several PU fits to reduce the
# variance introduced by the random hold-out split
predicted = np.zeros(len(x_data))
learning_iterations = 24
for index in range(learning_iterations):
    # pu_estimator, probs1y1 = fit_PU_estimator(x_data, y_labeled, 0.2, RandomForestClassifier())
    pu_estimator, probs1y1 = fit_PU_estimator(x_data, y_labeled, 0.2, xgb.XGBClassifier())
    predicted += predict_PU_prob(x_data, pu_estimator, probs1y1)
    if index % 4 == 0:
        print(f'Learning Iteration::{index}/{learning_iterations} => P(s=1|y=1)={round(probs1y1, 2)}')
# label a sample positive when its averaged corrected score exceeds 0.1
y_predict = [1 if x > 0.1 else 0 for x in (predicted / learning_iterations)]
print(classification_report(y_positive, y_predict, target_names=['class 0', 'class 1']))
              precision    recall  f1-score   support

     class 0       0.93      1.00      0.96       762
     class 1       0.99      0.91      0.95       610

    accuracy                           0.96      1372
   macro avg       0.96      0.95      0.96      1372
weighted avg       0.96      0.96      0.96      1372
evaluate_results(y_positive, y_predict)
Classification results:
f1: 95.03%
roc: 95.29%
recall: 90.98%
precision: 99.46%
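# The 0.1 decision threshold above was hand-picked. A sketch of one way to
# choose it, assuming access to the true labels y_positive (which only holds
# in a benchmark setting like this one): scan the precision-recall curve of
# the averaged scores for the threshold that maximizes F1.
from sklearn.metrics import precision_recall_curve
avg_scores = predicted / learning_iterations
precision, recall, thresholds = precision_recall_curve(y_positive, avg_scores)
f1_scores = 2 * precision * recall / np.clip(precision + recall, 1e-12, None)
best = np.argmax(f1_scores[:-1])  # the last precision/recall point has no threshold
print(f'best F1 = {f1_scores[best]:.3f} at threshold = {thresholds[best]:.3f}')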