Feature Engineering

Encoding Categorical Values

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the Kickstarter projects, parsing the two date columns as datetimes
ks = pd.read_csv('ks-projects-201801.csv', parse_dates=['deadline', 'launched'])

# Drop live projects
ks = ks.query('state != "live"')

# Add outcome column, "successful" == 1, others are 0
ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))

# Timestamp features: every component of the launch time that `data_cols`
# selects below must be created here
ks = ks.assign(hour=ks.launched.dt.hour,
               day=ks.launched.dt.day,
               month=ks.launched.dt.month,
               year=ks.launched.dt.year)

# Label encoding: `apply` calls fit_transform once per column, so each
# categorical column gets its own integer mapping
cat_features = ['category', 'currency', 'country']
encoder = LabelEncoder()
encoded = ks[cat_features].apply(encoder.fit_transform)

# Baseline frame = numeric/timestamp features + target + label-encoded categoricals
data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
baseline_data = ks[data_cols].join(encoded)
In [2]:
import lightgbm as lgb
from sklearn import metrics

def get_data_splits(dataframe, valid_fraction=0.1):
    """Split rows, in their existing order, into train / validation / test sets.

    The last ``valid_fraction`` of the rows becomes the test set, the
    ``valid_fraction`` of rows just before it becomes the validation set, and
    everything earlier is the training set. Rows are not shuffled.

    Parameters
    ----------
    dataframe : pandas.DataFrame or any sliceable sequence
        The rows to split.
    valid_fraction : float, default 0.1
        Fraction of the rows used for the validation set; the test set gets
        the same number of rows.

    Returns
    -------
    tuple
        ``(train, valid, test)`` slices of ``dataframe``.
    """
    n_rows = len(dataframe)
    valid_size = int(n_rows * valid_fraction)

    # Use explicit non-negative boundaries instead of negative slice offsets:
    # when valid_size is 0, `dataframe[:-0]` would be empty and
    # `dataframe[-0:]` would be the whole frame, i.e. all rows would land in
    # the test set and none in train.
    valid_start = max(n_rows - 2 * valid_size, 0)
    test_start = max(n_rows - valid_size, 0)

    train = dataframe[:valid_start]
    # valid size == test size, last two sections of the data
    valid = dataframe[valid_start:test_start]
    test = dataframe[test_start:]
    return train, valid, test

def train_model(train, valid):
    """Fit a LightGBM binary classifier and report its validation AUC.

    Every column of ``train`` except ``'outcome'`` is used as a feature;
    ``'outcome'`` is the label. Training runs for at most 1000 boosting
    rounds and stops early after 10 rounds without improvement on ``valid``.
    The validation AUC is printed and the trained booster is returned.
    """
    target = 'outcome'
    predictors = train.columns.drop(target)

    train_set = lgb.Dataset(train[predictors], label=train[target])
    valid_set = lgb.Dataset(valid[predictors], label=valid[target])

    params = {
        'num_leaves': 64,
        'objective': 'binary',
        'metric': 'auc',
        'seed': 7,
    }

    print("Training model!")
    # NOTE(review): the `early_stopping_rounds` and `verbose_eval` keyword
    # arguments were removed from lgb.train in LightGBM 4.0; on newer versions
    # the lgb.early_stopping / lgb.log_evaluation callbacks are needed instead
    # -- confirm which version this notebook is pinned to.
    bst = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[valid_set],
        early_stopping_rounds=10,
        verbose_eval=False,
    )

    # Score the fitted booster on the validation rows
    valid_pred = bst.predict(valid[predictors])
    valid_score = metrics.roc_auc_score(valid[target], valid_pred)
    print(f"Validation AUC score: {valid_score:.4f}")
    return bst
In [3]:
# Train a model on the baseline data (label-encoded categoricals only).
# The third split (test) is discarded here; only train and valid are used.
train, valid, _ = get_data_splits(baseline_data)
bst = train_model(train, valid)
Training model!
Validation AUC score: 0.7467

Count Encoding

  • Count encoding replaces each categorical value with the number of times it appears in the dataset
In [4]:
import category_encoders as ce
cat_features = ['category', 'currency', 'country']
# The count encoder is fit on every row of `ks`, i.e. before the data is
# split into train / valid / test below.
count_enc = ce.CountEncoder()
count_encoded = count_enc.fit_transform(ks[cat_features])

# Add the counts as extra `<column>_count` features next to the baseline columns
data = baseline_data.join(count_encoded.add_suffix("_count"))

# Train a model on the baseline features plus the count-encoded columns
train, valid, test = get_data_splits(data)
bst = train_model(train, valid)
Training model!
Validation AUC score: 0.7486
In [5]:
goal hour day month year outcome category currency country category_count currency_count country_count
0 1000.0 12 11 8 2015 0 108 5 9 1362 33853 33393
1 30000.0 4 2 9 2017 0 93 13 22 5174 293624 290887
2 45000.0 0 12 1 2013 0 93 13 22 5174 293624 290887
3 5000.0 3 17 3 2012 0 90 13 22 15647 293624 290887
4 19500.0 8 4 7 2015 0 55 13 22 10054 293624 290887
... ... ... ... ... ... ... ... ... ... ... ... ...
378656 50000.0 2 17 9 2014 0 39 13 22 16082 293624 290887
378657 1500.0 3 22 6 2011 0 93 13 22 5174 293624 290887
378658 15000.0 19 1 7 2010 0 93 13 22 5174 293624 290887
378659 15000.0 18 13 1 2016 0 138 13 22 6850 293624 290887
378660 2000.0 9 19 7 2011 0 98 13 22 2150 293624 290887

375862 rows × 12 columns

Target Encoding

  • Target encoding replaces a categorical value with the average value of the target for that value of the feature
In [6]:
import category_encoders as ce
cat_features = ['category', 'currency', 'country']

# Create the encoder itself
target_enc = ce.TargetEncoder(cols=cat_features)

# `data` is the frame built in the count-encoding cell, so the *_count
# columns remain part of the feature set here.
train, valid, _ = get_data_splits(data)

# Fit the encoder using the categorical features and target.
# Only the training split is passed to fit(), so the encoder never sees
# validation or test targets.
target_enc.fit(train[cat_features], train['outcome'])

# Transform the features, rename the columns with _target suffix, and join to dataframe
train = train.join(target_enc.transform(train[cat_features]).add_suffix('_target'))
valid = valid.join(target_enc.transform(valid[cat_features]).add_suffix('_target'))

# Train on baseline + count + target-encoded features
bst = train_model(train, valid)
Training model!
Validation AUC score: 0.7491
In [9]:
goal hour day month year outcome category currency country category_count currency_count country_count category_target currency_target country_target
0 1000.0 12 11 8 2015 0 108 5 9 1362 33853 33393 0.360190 0.357122 0.361636
1 30000.0 4 2 9 2017 0 93 13 22 5174 293624 290887 0.384615 0.373392 0.376631
2 45000.0 0 12 1 2013 0 93 13 22 5174 293624 290887 0.384615 0.373392 0.376631
3 5000.0 3 17 3 2012 0 90 13 22 15647 293624 290887 0.412655 0.373392 0.376631
4 19500.0 8 4 7 2015 0 55 13 22 10054 293624 290887 0.302625 0.373392 0.376631
In [10]:
goal hour day month year outcome category currency country category_count currency_count country_count category_target currency_target country_target
302896 5000.0 16 12 6 2015 1 39 13 22 16082 293624 290887 0.369690 0.373392 0.376631
302897 3700.0 1 8 7 2013 1 54 13 22 9120 293624 290887 0.251262 0.373392 0.376631
302898 5500.0 22 27 5 2014 1 90 5 9 15647 33853 33393 0.412655 0.357122 0.361636
302899 25000.0 5 29 7 2014 0 136 13 22 14072 293624 290887 0.560610 0.373392 0.376631
302900 4800.0 13 5 8 2014 0 156 5 9 2089 33853 33393 0.445242 0.357122 0.361636

CatBoost Encoding

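  • CatBoost encoding is also based on the target probability for a given value, but for each row that probability is calculated only from the rows that come before it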
In [11]:
cat_features = ['category', 'currency', 'country']
# NOTE: `target_enc` is rebound here from the TargetEncoder to a CatBoostEncoder.
target_enc = ce.CatBoostEncoder(cols=cat_features)

# Fresh split of `data`, so the *_target columns joined onto the previous
# train/valid frames are not carried over.
train, valid, _ = get_data_splits(data)
# Fit on the training split only
target_enc.fit(train[cat_features], train['outcome'])

# NOTE(review): transform() is called on the training rows without the target.
# The category_encoders docs describe passing y (or using fit_transform) when
# encoding training data -- confirm whether encoding train without y is intended.
train = train.join(target_enc.transform(train[cat_features]).add_suffix('_cb'))
valid = valid.join(target_enc.transform(valid[cat_features]).add_suffix('_cb'))

# Train on baseline + count + CatBoost-encoded features
bst = train_model(train, valid)
Training model!
Validation AUC score: 0.7492