# Load the Kickstarter projects CSV; both timestamp columns are parsed as datetimes.
import pandas as pd

ks = pd.read_csv(
    'ks-projects-201801.csv',
    parse_dates=['deadline', 'launched'],
)
ks.head(10)

# Distinct project states found in the file (the original author noted six).
ks['state'].unique()

# Per-state row counts (non-null count of every column within each state).
ks.groupby('state').count()

# --- Data cleaning ---
# Keep every row whose state is anything other than "live".
ks = ks.loc[ks['state'].ne('live')]

# Add the derived columns in a single pass:
#   outcome                   -> 1 when state is "successful", 0 for any other state
#   hour / day / month / year -> components of the `launched` timestamp
ks = ks.assign(
    outcome=ks['state'].eq('successful').astype(int),
    hour=ks['launched'].dt.hour,
    day=ks['launched'].dt.day,
    month=ks['launched'].dt.month,
    year=ks['launched'].dt.year,
)
ks.head()
from sklearn.preprocessing import LabelEncoder

# Integer-encode the three categorical columns. A single encoder instance is
# re-fitted on each column in turn, so afterwards it presumably only retains the
# mapping of the last column it saw -- keep that in mind before reusing it.
encoder = LabelEncoder()
encoded = ks[['category', 'currency', 'country']].apply(
    lambda column: encoder.fit_transform(column)
)
encoded.head()

# Modelling table: the numeric/time features and the target, with the encoded
# categorical columns joined on by index.
data = ks[
    ['goal', 'hour', 'day', 'month', 'year', 'outcome']
].join(encoded)
data.head()
from sklearn.utils import shuffle

# Shuffle the rows before splitting. The fixed seed makes the split -- and
# therefore every score computed from it -- reproducible from run to run.
data = shuffle(data, random_state=0)

# Reserve 10% of the rows for validation and another 10% for testing.
valid_fraction = 0.1
valid_size = int(len(data) * valid_fraction)

# Slice with non-negative positions. Negative slicing misbehaves when
# valid_size is 0 (fewer than 10 rows): data[:-0] is data[:0], which would leave
# the training set empty and put every row into the test set.
train_end = len(data) - 2 * valid_size
valid_end = train_end + valid_size
train = data.iloc[:train_end]
valid = data.iloc[train_end:valid_end]
test = data.iloc[valid_end:]

# Mean of the 0/1 outcome column in each split; the three values should be
# close to each other if the shuffle spread the classes evenly.
for each in [train, valid, test]:
    print(f"Outcome fraction = {each.outcome.mean():.4f}")

# Separate the feature columns (X) from the target column (Y) for each split.
train_X = train.drop(columns=['outcome'])
train_Y = train['outcome']
valid_X = valid.drop(columns=['outcome'])
valid_Y = valid['outcome']
test_X = test.drop(columns=['outcome'])
test_Y = test['outcome']
import lightgbm as lgb
from sklearn import metrics

# Wrap the train/validation splits in LightGBM's dataset container.
dtrain = lgb.Dataset(train_X, label=train_Y)
dvalid = lgb.Dataset(valid_X, label=valid_Y)

# Binary classifier with up to 64 leaves per tree, evaluated by ROC AUC.
param = {'num_leaves': 64, 'objective': 'binary', 'metric': 'auc'}
num_round = 1000  # upper bound on boosting rounds; early stopping may end sooner

# Early stopping and log silencing are passed as callbacks: the
# `early_stopping_rounds=` / `verbose_eval=` keyword arguments of lgb.train
# were removed in LightGBM 4.0 and raise TypeError there.
bst = lgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    valid_sets=[dvalid],
    callbacks=[
        # Stop once validation AUC has not improved for 10 consecutive rounds.
        lgb.early_stopping(stopping_rounds=10, verbose=False),
        # period=0 turns off per-iteration evaluation logging.
        lgb.log_evaluation(period=0),
    ],
)

# Score the held-out test split with ROC AUC.
ypred = bst.predict(test_X)
score = metrics.roc_auc_score(test_Y, ypred)
score