Feature Engineer¶

Encoding Categorical Values¶

import pandas as pd
from sklearn.preprocessing import LabelEncoder

ks = pd.read_csv('ks-projects-201801.csv', parse_dates=['deadline', 'launched'])

# Drop live projects
ks = ks.query('state != "live"')

# Add outcome column, "successful" == 1, others are 0
ks = ks.assign(outcome=(ks['state'] == 'successful').astype(int))

# Timestamp features
ks = ks.assign(hour=ks.launched.dt.hour,
               day=ks.launched.dt.day,
               month=ks.launched.dt.month,
               year=ks.launched.dt.year)

# Label encoding
cat_features = ['category', 'currency', 'country']
encoder = LabelEncoder()
encoded = ks[cat_features].apply(encoder.fit_transform)

data_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
baseline_data = ks[data_cols].join(encoded)

import lightgbm as lgb
from sklearn import metrics

def get_data_splits(dataframe, valid_fraction=0.1):
    valid_fraction = 0.1
    valid_size = int(len(dataframe) * valid_fraction)

    train = dataframe[:-valid_size * 2]
    # valid size == test size, last two sections of the data
    valid = dataframe[-valid_size * 2:-valid_size]
    test = dataframe[-valid_size:]
    
    return train, valid, test

def train_model(train, valid):
    feature_cols = train.columns.drop('outcome')

    dtrain = lgb.Dataset(train[feature_cols], label=train['outcome'])
    dvalid = lgb.Dataset(valid[feature_cols], label=valid['outcome'])

    param = {'num_leaves': 64, 'objective': 'binary', 
             'metric': 'auc', 'seed': 7}
    print("Training model!")
    bst = lgb.train(param, dtrain, num_boost_round=1000, valid_sets=[dvalid], 
                    early_stopping_rounds=10, verbose_eval=False)

    valid_pred = bst.predict(valid[feature_cols])
    valid_score = metrics.roc_auc_score(valid['outcome'], valid_pred)
    print(f"Validation AUC score: {valid_score:.4f}")
    return bst

/opt/anaconda3/lib/python3.7/site-packages/lightgbm/__init__.py:48: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.3) compiler.
This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
  "You can install the OpenMP library by the following command: ``brew install libomp``.", UserWarning)

# Training a model on the baseline data
train, valid, _ = get_data_splits(baseline_data)
bst = train_model(train, valid)

Training model!
Validation AUC score: 0.7467

Count Encoding¶

Count encoding replaces each categorical value with the number of times it appears in the dataset

import category_encoders as ce
cat_features = ['category', 'currency', 'country']
count_enc = ce.CountEncoder()
count_encoded = count_enc.fit_transform(ks[cat_features])

data = baseline_data.join(count_encoded.add_suffix("_count"))

# Training a model on the baseline data
train, valid, test = get_data_splits(data)
bst = train_model(train, valid)

Training model!
Validation AUC score: 0.7486

data

Target Encoding¶

Target encoding replaces a categorical value with the average value of the target for that value of the feature

import category_encoders as ce
cat_features = ['category', 'currency', 'country']

# Create the encoder itself
target_enc = ce.TargetEncoder(cols=cat_features)

train, valid, _ = get_data_splits(data)

# Fit the encoder using the categorical features and target
target_enc.fit(train[cat_features], train['outcome'])

# Transform the features, rename the columns with _target suffix, and join to dataframe
train = train.join(target_enc.transform(train[cat_features]).add_suffix('_target'))
valid = valid.join(target_enc.transform(valid[cat_features]).add_suffix('_target'))

train.head()
bst = train_model(train, valid)

Training model!
Validation AUC score: 0.7491

train.head()

valid.head()

CatBoost Encoding¶

Based on the target probablity for a given value

cat_features = ['category', 'currency', 'country']
target_enc = ce.CatBoostEncoder(cols=cat_features)

train, valid, _ = get_data_splits(data)
target_enc.fit(train[cat_features], train['outcome'])

train = train.join(target_enc.transform(train[cat_features]).add_suffix('_cb'))
valid = valid.join(target_enc.transform(valid[cat_features]).add_suffix('_cb'))

bst = train_model(train, valid)

Training model!
Validation AUC score: 0.7492

	goal	hour	day	month	year	outcome	category	currency	country	category_count	currency_count	country_count
0	1000.0	12	11	8	2015	0	108	5	9	1362	33853	33393
1	30000.0	4	2	9	2017	0	93	13	22	5174	293624	290887
2	45000.0	0	12	1	2013	0	93	13	22	5174	293624	290887
3	5000.0	3	17	3	2012	0	90	13	22	15647	293624	290887
4	19500.0	8	4	7	2015	0	55	13	22	10054	293624	290887
...	...	...	...	...	...	...	...	...	...	...	...	...
378656	50000.0	2	17	9	2014	0	39	13	22	16082	293624	290887
378657	1500.0	3	22	6	2011	0	93	13	22	5174	293624	290887
378658	15000.0	19	1	7	2010	0	93	13	22	5174	293624	290887
378659	15000.0	18	13	1	2016	0	138	13	22	6850	293624	290887
378660	2000.0	9	19	7	2011	0	98	13	22	2150	293624	290887

	goal	hour	day	month	year	category	currency	country	category_count	currency_count	country_count	category_target	currency_target	country_target
0	1000.0	12	11	8	2015	108	5	9	1362	33853	33393	0.360190	0.357122	0.361636
1	30000.0	4	2	9	2017	93	13	22	5174	293624	290887	0.384615	0.373392	0.376631
2	45000.0	0	12	1	2013	93	13	22	5174	293624	290887	0.384615	0.373392	0.376631
3	5000.0	3	17	3	2012	90	13	22	15647	293624	290887	0.412655	0.373392	0.376631
4	19500.0	8	4	7	2015	55	13	22	10054	293624	290887	0.302625	0.373392	0.376631

	goal	hour	day	month	year	outcome	category	currency	country	category_count	currency_count	country_count	category_target	currency_target	country_target
302896	5000.0	16	12	6	2015	1	39	13	22	16082	293624	290887	0.369690	0.373392	0.376631
302897	3700.0	1	8	7	2013	1	54	13	22	9120	293624	290887	0.251262	0.373392	0.376631
302898	5500.0	22	27	5	2014	1	90	5	9	15647	33853	33393	0.412655	0.357122	0.361636
302899	25000.0	5	29	7	2014	0	136	13	22	14072	293624	290887	0.560610	0.373392	0.376631
302900	4800.0	13	5	8	2014	0	156	5	9	2089	33853	33393	0.445242	0.357122	0.361636