Kickstarter Projects

Predict whether a Kickstarter project will be successfully funded

Develop a baseline model for comparing performance on models with more features
Encode categorical features so the model can make better use of the information
Generate new features to provide more information for the model
Select features to reduce overfitting and increase prediction speed

# Read Data
import pandas as pd
# Load the Kickstarter export, parsing both timestamp columns as datetimes.
date_columns = ['deadline', 'launched']
ks = pd.read_csv('ks-projects-201801.csv', parse_dates=date_columns)
ks.head(n=10)

# Project States
# Distinct values of the `state` column (six in this dataset).
ks['state'].unique()

array(['failed', 'canceled', 'successful', 'live', 'undefined',
       'suspended'], dtype=object)

# List the Number of Samples in Each State
# count() reports non-null entries per column, so a column with missing values
# shows a smaller number than the others in the same state.
by_state = ks.groupby('state')
by_state.count()

# Data Cleaning
# Drop the rows whose state is "live".
is_live = ks['state'] == 'live'
ks = ks[~is_live]

# Binary target column: 1 when state is "successful", 0 for every other state.
is_successful = ks['state'] == 'successful'
ks = ks.assign(outcome=is_successful.astype(int))

# Break the launch timestamp into calendar components used as categorical features.
launch_time = ks['launched'].dt
ks = ks.assign(
    hour=launch_time.hour,
    day=launch_time.day,
    month=launch_time.month,
    year=launch_time.year,
)

ks.head()

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

# Integer-encode each categorical column. DataFrame.apply hands the columns to
# fit_transform one at a time, so the single encoder is refit for every column
# and afterwards holds only the classes of the last one ('country').
cat_features = ['category', 'currency', 'country']
encoded = ks[cat_features].apply(encoder.fit_transform)

encoded.head()

# Modelling table: numeric/date columns plus the target, joined (on the row
# index) with the integer-encoded categorical columns.
base_columns = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
data = ks[base_columns].join(encoded)
data.head()

from sklearn.utils import shuffle
# Shuffle once with a fixed seed so the train/valid/test split (and every score
# computed from it) is reproducible between runs.
data = shuffle(data, random_state=0)

valid_fraction = 0.1
valid_size = int(len(data) * valid_fraction)

# Use explicit non-negative positional boundaries. The negative-index form
# data[:-2 * valid_size] silently yields an EMPTY training set (and puts every
# row in the test set) when valid_size is 0, because -0 == 0.
valid_start = len(data) - 2 * valid_size
test_start = len(data) - valid_size

train = data.iloc[:valid_start]            # first 80% of the shuffled rows
valid = data.iloc[valid_start:test_start]  # next 10%
test = data.iloc[test_start:]              # final 10%

# Sanity check: the positive-class rate should be similar in all three splits.
for each in [train, valid, test]:
    print(f"Outcome fraction = {each.outcome.mean():.4f}")

Outcome fraction = 0.3565
Outcome fraction = 0.3562
Outcome fraction = 0.3556

# Separate the target column from the predictor columns for each split.
target = 'outcome'

train_X, train_Y = train.drop(columns=[target]), train[target]
valid_X, valid_Y = valid.drop(columns=[target]), valid[target]
test_X, test_Y = test.drop(columns=[target]), test[target]

import lightgbm as lgb

# Wrap the training and validation splits in LightGBM Dataset containers.
dtrain = lgb.Dataset(train_X, label=train_Y)
dvalid = lgb.Dataset(valid_X, label=valid_Y)

# Binary classifier evaluated with AUC on the validation set.
param = {'num_leaves': 64, 'objective': 'binary'}
param['metric'] = 'auc'
num_round = 1000  # upper bound on boosting rounds; early stopping usually ends sooner

# Early stopping is supplied as a callback: the `early_stopping_rounds` and
# `verbose_eval` keyword arguments were removed from lgb.train() in
# LightGBM 4.0 and raise TypeError there. The callback form also works on
# older releases. verbose=False keeps the run silent, as verbose_eval=False did.
bst = lgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    valid_sets=[dvalid],
    callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)],
)

from sklearn import metrics

# Score the held-out test split: ROC AUC of the model's predictions against
# the true labels.
ypred = bst.predict(test_X)
score = metrics.roc_auc_score(y_true=test_Y, y_score=ypred)
score

0.7477815752109231

	ID	name	category	main_category	currency	deadline	goal	launched	pledged	state	backers	country	usd pledged	usd_pledged_real	usd_goal_real
0	1000002330	The Songs of Adelaide & Abullah	Poetry	Publishing	GBP	2015-10-09	1000.0	2015-08-11 12:12:28	0.00	failed	0	GB	0.00	0.00	1533.95
1	1000003930	Greeting From Earth: ZGAC Arts Capsule For ET	Narrative Film	Film & Video	USD	2017-11-01	30000.0	2017-09-02 04:43:57	2421.00	failed	15	US	100.00	2421.00	30000.00
2	1000004038	Where is Hank?	Narrative Film	Film & Video	USD	2013-02-26	45000.0	2013-01-12 00:20:50	220.00	failed	3	US	220.00	220.00	45000.00
3	1000007540	ToshiCapital Rekordz Needs Help to Complete Album	Music	Music	USD	2012-04-16	5000.0	2012-03-17 03:24:11	1.00	failed	1	US	1.00	1.00	5000.00
4	1000011046	Community Film Project: The Art of Neighborhoo...	Film & Video	Film & Video	USD	2015-08-29	19500.0	2015-07-04 08:35:03	1283.00	canceled	14	US	1283.00	1283.00	19500.00
5	1000014025	Monarch Espresso Bar	Restaurants	Food	USD	2016-04-01	50000.0	2016-02-26 13:38:27	52375.00	successful	224	US	52375.00	52375.00	50000.00
6	1000023410	Support Solar Roasted Coffee & Green Energy! ...	Food	Food	USD	2014-12-21	1000.0	2014-12-01 18:30:44	1205.00	successful	16	US	1205.00	1205.00	1000.00
7	1000030581	Chaser Strips. Our Strips make Shots their B*tch!	Drinks	Food	USD	2016-03-17	25000.0	2016-02-01 20:05:12	453.00	failed	40	US	453.00	453.00	25000.00
8	1000034518	SPIN - Premium Retractable In-Ear Headphones w...	Product Design	Design	USD	2014-05-29	125000.0	2014-04-24 18:14:43	8233.00	canceled	58	US	8233.00	8233.00	125000.00
9	100004195	STUDIO IN THE SKY - A Documentary Feature Film...	Documentary	Film & Video	USD	2014-08-10	65000.0	2014-07-11 21:55:48	6240.57	canceled	43	US	6240.57	6240.57	65000.00

	ID	name	category	main_category	currency	deadline	goal	launched	pledged	backers	country	usd pledged	usd_pledged_real	usd_goal_real
state
canceled	38779	38779	38779	38779	38779	38779	38779	38779	38779	38779	38779	38757	38779	38779
failed	197719	197716	197719	197719	197719	197719	197719	197719	197719	197719	197719	197614	197719	197719
live	2799	2799	2799	2799	2799	2799	2799	2799	2799	2799	2799	2798	2799	2799
successful	133956	133956	133956	133956	133956	133956	133956	133956	133956	133956	133956	133851	133956	133956
suspended	1846	1845	1846	1846	1846	1846	1846	1846	1846	1846	1846	1844	1846	1846
undefined	3562	3562	3562	3562	3562	3562	3562	3562	3562	3562	3562	0	3562	3562

	ID	name	category	main_category	currency	deadline	goal	launched	pledged	state	backers	country	usd pledged	usd_pledged_real	usd_goal_real	hour	day	month	year
0	1000002330	The Songs of Adelaide & Abullah	Poetry	Publishing	GBP	2015-10-09	1000.0	2015-08-11 12:12:28	0.0	failed	0	GB	0.0	0.0	1533.95	12	11	8	2015
1	1000003930	Greeting From Earth: ZGAC Arts Capsule For ET	Narrative Film	Film & Video	USD	2017-11-01	30000.0	2017-09-02 04:43:57	2421.0	failed	15	US	100.0	2421.0	30000.00	4	2	9	2017
2	1000004038	Where is Hank?	Narrative Film	Film & Video	USD	2013-02-26	45000.0	2013-01-12 00:20:50	220.0	failed	3	US	220.0	220.0	45000.00	0	12	1	2013
3	1000007540	ToshiCapital Rekordz Needs Help to Complete Album	Music	Music	USD	2012-04-16	5000.0	2012-03-17 03:24:11	1.0	failed	1	US	1.0	1.0	5000.00	3	17	3	2012
4	1000011046	Community Film Project: The Art of Neighborhoo...	Film & Video	Film & Video	USD	2015-08-29	19500.0	2015-07-04 08:35:03	1283.0	canceled	14	US	1283.0	1283.0	19500.00	8	4	7	2015

	category	currency	country
0	108	5	9
1	93	13	22
2	93	13	22
3	90	13	22
4	55	13	22

	goal	hour	day	month	year	category	currency	country
0	1000.0	12	11	8	2015	108	5	9
1	30000.0	4	2	9	2017	93	13	22
2	45000.0	0	12	1	2013	93	13	22
3	5000.0	3	17	3	2012	90	13	22
4	19500.0	8	4	7	2015	55	13	22