Feature Engineer

Encoding Categorical Values

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

ks = pd.read_csv('ks-projects-201801.csv', parse_dates=['deadline', 'launched'])

# Drop live projects
ks = ks.query('state != "live"')

# Add outcome column, "successful" == 1, others are 0
ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))

# Timestamp features
ks = ks.assign(hour=ks.launched.dt.hour,
               day=ks.launched.dt.day,
               month=ks.launched.dt.month,
               year=ks.launched.dt.year)

# Label encoding
cat_features = ['category', 'currency', 'country']
encoder = LabelEncoder()
encoded = ks[cat_features].apply(encoder.fit_transform)

data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
baseline_data = ks[data_cols].join(encoded)
In [2]:
import lightgbm as lgb
from sklearn import metrics

def get_data_splits(dataframe, valid_fraction=0.1):
    valid_fraction = 0.1
    valid_size = int(len(dataframe) * valid_fraction)

    train = dataframe[:-valid_size * 2]
    # valid size == test size, last two sections of the data
    valid = dataframe[-valid_size * 2:-valid_size]
    test = dataframe[-valid_size:]
    
    return train, valid, test

def train_model(train, valid):
    feature_cols = train.columns.drop('outcome')

    dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
    dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

    param = {'num_leaves': 64, 'objective': 'binary', 
             'metric': 'auc', 'seed': 7}
    print("Training model!")
    bst = lgb.train(param, dtrain, num_boost_round=1000, valid_sets=[dvalid], 
                    early_stopping_rounds=10, verbose_eval=False)

    valid_pred = bst.predict(valid[feature_cols])
    valid_score = metrics.roc_auc_score(valid['outcome'], valid_pred)
    print(f"Validation AUC score: {valid_score:.4f}")
    return bst
/opt/anaconda3/lib/python3.7/site-packages/lightgbm/__init__.py:48: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.3) compiler.
This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
  "You can install the OpenMP library by the following command: ``brew install libomp``.", UserWarning)
In [3]:
# Training a model on the baseline data
train, valid, _ = get_data_splits(baseline_data)
bst = train_model(train, valid)
Training model!
Validation AUC score: 0.7467

Count Encoding

  • Count encoding replaces each categorical value with the number of times it appears in the dataset
In [4]:
import category_encoders as ce
cat_features = ['category', 'currency', 'country']
count_enc = ce.CountEncoder()
count_encoded = count_enc.fit_transform(ks[cat_features])

data = baseline_data.join(count_encoded.add_suffix("_count"))

# Training a model on the baseline data
train, valid, test = get_data_splits(data)
bst = train_model(train, valid)
Training model!
Validation AUC score: 0.7486
In [5]:
data
Out[5]:
goal hour day month year outcome category currency country category_count currency_count country_count
0 1000.0 12 11 8 2015 0 108 5 9 1362 33853 33393
1 30000.0 4 2 9 2017 0 93 13 22 5174 293624 290887
2 45000.0 0 12 1 2013 0 93 13 22 5174 293624 290887
3 5000.0 3 17 3 2012 0 90 13 22 15647 293624 290887
4 19500.0 8 4 7 2015 0 55 13 22 10054 293624 290887
... ... ... ... ... ... ... ... ... ... ... ... ...
378656 50000.0 2 17 9 2014 0 39 13 22 16082 293624 290887
378657 1500.0 3 22 6 2011 0 93 13 22 5174 293624 290887
378658 15000.0 19 1 7 2010 0 93 13 22 5174 293624 290887
378659 15000.0 18 13 1 2016 0 138 13 22 6850 293624 290887
378660 2000.0 9 19 7 2011 0 98 13 22 2150 293624 290887

375862 rows × 12 columns

Target Encoding

  • Target encoding replaces a categorical value with the average value of the target for that value of the feature
In [6]:
import category_encoders as ce
cat_features = ['category', 'currency', 'country']

# Create the encoder itself
target_enc = ce.TargetEncoder(cols=cat_features)

train, valid, _ = get_data_splits(data)

# Fit the encoder using the categorical features and target
target_enc.fit(train[cat_features], train['outcome'])

# Transform the features, rename the columns with _target suffix, and join to dataframe
train = train.join(target_enc.transform(train[cat_features]).add_suffix('_target'))
valid = valid.join(target_enc.transform(valid[cat_features]).add_suffix('_target'))

train.head()
bst = train_model(train, valid)
Training model!
Validation AUC score: 0.7491
In [9]:
train.head()
Out[9]:
goal hour day month year outcome category currency country category_count currency_count country_count category_target currency_target country_target
0 1000.0 12 11 8 2015 0 108 5 9 1362 33853 33393 0.360190 0.357122 0.361636
1 30000.0 4 2 9 2017 0 93 13 22 5174 293624 290887 0.384615 0.373392 0.376631
2 45000.0 0 12 1 2013 0 93 13 22 5174 293624 290887 0.384615 0.373392 0.376631
3 5000.0 3 17 3 2012 0 90 13 22 15647 293624 290887 0.412655 0.373392 0.376631
4 19500.0 8 4 7 2015 0 55 13 22 10054 293624 290887 0.302625 0.373392 0.376631
In [10]:
valid.head()
Out[10]:
goal hour day month year outcome category currency country category_count currency_count country_count category_target currency_target country_target
302896 5000.0 16 12 6 2015 1 39 13 22 16082 293624 290887 0.369690 0.373392 0.376631
302897 3700.0 1 8 7 2013 1 54 13 22 9120 293624 290887 0.251262 0.373392 0.376631
302898 5500.0 22 27 5 2014 1 90 5 9 15647 33853 33393 0.412655 0.357122 0.361636
302899 25000.0 5 29 7 2014 0 136 13 22 14072 293624 290887 0.560610 0.373392 0.376631
302900 4800.0 13 5 8 2014 0 156 5 9 2089 33853 33393 0.445242 0.357122 0.361636

CatBoost Encoding

  • Based on the target probablity for a given value
In [11]:
cat_features = ['category', 'currency', 'country']
target_enc = ce.CatBoostEncoder(cols=cat_features)

train, valid, _ = get_data_splits(data)
target_enc.fit(train[cat_features], train['outcome'])

train = train.join(target_enc.transform(train[cat_features]).add_suffix('_cb'))
valid = valid.join(target_enc.transform(valid[cat_features]).add_suffix('_cb'))

bst = train_model(train, valid)
Training model!
Validation AUC score: 0.7492