California Housing Prices

1. Prepare Training Set and Test Set

In [51]:
import pandas as pd
# Read Data
data = pd.read_csv('housing.csv')

# Create Training Set and Test Set

from sklearn.model_selection import train_test_split
# train_test_split
# test_size, ratio of test set
# train_size, ratio of training set
# random_state, random seed
# shuffle, whether or not to shuffle the data before splitting
# stratify, data is split in a stratified fashion

import numpy as np
data_X = data.drop(['median_house_value'], axis = 1) # DataFrame
data_Y = data['median_house_value'] # Series

# create income categories for stratified sampling; cap at category 5
data['income_cat'] = np.ceil(data['median_income'] / 1.5)
data['income_cat'] = data['income_cat'].where(data['income_cat'] < 5, 5)

train_X, test_X, train_Y, test_Y = train_test_split(data_X, data_Y, test_size=0.2, random_state=42, stratify = data['income_cat'])
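
A quick sanity check that the stratified split preserved the income-category proportions; test_Y keeps the original row index, so it can be matched back to income_cat:

In [ ]:
# compare income-category proportions: full dataset vs. test split
overall = data['income_cat'].value_counts(normalize=True).sort_index()
in_test = data.loc[test_Y.index, 'income_cat'].value_counts(normalize=True).sort_index()
pd.concat([overall, in_test], axis=1, keys=['overall', 'test'])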

2. Prepare the Data for Machine Learning Algorithms

In [56]:
# Custom Transformer, add three features

from sklearn.base import BaseEstimator, TransformerMixin

class AddAttributes(BaseEstimator, TransformerMixin):
    def __init__(self): # no *args or **kwargs
        pass
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        X = X.copy()  # avoid mutating the caller's DataFrame
        X["rooms_per_household"] = X["total_rooms"] / X["households"]
        X["bedrooms_per_room"] = X["total_bedrooms"] / X["total_rooms"]
        X["population_per_household"] = X["population"] / X["households"]
        return X
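
A quick look at what the transformer adds, applied to a few training rows:

In [ ]:
demo = AddAttributes().transform(train_X.head(3))
demo[['rooms_per_household', 'bedrooms_per_room', 'population_per_household']]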
In [57]:
# Select specific columns
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]
In [58]:
# Create Pipeline for Numeric Columns
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(
        ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
         'total_bedrooms', 'population', 'households', 'median_income'])),
    ('attribs_adder', AddAttributes()),
    # the imputer also fills NaNs in bedrooms_per_room, which inherits
    # the missing values of total_bedrooms
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

#train_X_tr = num_pipeline.fit_transform(train_X)
In [59]:
# Create Pipeline for Categorical Columns
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(['ocean_proximity'])),
    ('cat', OneHotEncoder()),
])

#data_cat_tr = cat_pipeline.fit_transform(train_X)
In [60]:
# Merge Numeric Columns and Categorical Columns
from sklearn.pipeline import FeatureUnion

preprocess_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
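
On scikit-learn 0.20 and later, ColumnTransformer can replace the DataFrameSelector/FeatureUnion pattern; an equivalent sketch (not used below):

In [ ]:
from sklearn.compose import ColumnTransformer

# equivalent preprocessing without the custom selector
alt_preprocess = ColumnTransformer([
    ('num', Pipeline([
        ('attribs_adder', AddAttributes()),
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ]), ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
         'total_bedrooms', 'population', 'households', 'median_income']),
    ('cat', OneHotEncoder(), ['ocean_proximity']),
])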
In [61]:
train_X = preprocess_pipeline.fit_transform(train_X).toarray()
In [62]:
test_X = preprocess_pipeline.transform(test_X).toarray()

3. Fine-Tune the Model with Grid Search

In [63]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = [{'n_estimators': [40, 50, 100], 'max_features': [2, 4, 6, 8, 10, 12]}]

grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')

grid_search.fit(train_X, train_Y)
Out[63]:
GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'max_features': [2, 4, 6, 8, 10, 12],
                          'n_estimators': [40, 50, 100]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)
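
For larger hyperparameter spaces, RandomizedSearchCV is a drop-in alternative that samples a fixed number of combinations instead of trying the full grid; a sketch, not run here:

In [ ]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# sample 10 random combinations from the given distributions
rnd_search = RandomizedSearchCV(
    RandomForestRegressor(), n_iter=10, cv=5,
    param_distributions={'n_estimators': randint(40, 200),
                         'max_features': randint(2, 13)},
    scoring='neg_mean_squared_error', random_state=42)
# rnd_search.fit(train_X, train_Y)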
In [64]:
grid_search.best_params_ # best combination of parameters
Out[64]:
{'max_features': 8, 'n_estimators': 100}
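
Since scoring was 'neg_mean_squared_error', the cross-validation RMSE of this best combination can be recovered directly:

In [ ]:
np.sqrt(-grid_search.best_score_)  # CV RMSE of the best parameter combination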
In [65]:
grid_search.best_estimator_ # best model
# If GridSearchCV is initialized with refit=True (which is the default),
# then once it finds the best estimator using cross-validation, it retrains it on the whole training set.
Out[65]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features=8, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)
In [66]:
# evaluation scores
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
52297.36276759568 {'max_features': 2, 'n_estimators': 40}
52306.63395851931 {'max_features': 2, 'n_estimators': 50}
51956.10854795302 {'max_features': 2, 'n_estimators': 100}
50273.38718954273 {'max_features': 4, 'n_estimators': 40}
49990.40837322438 {'max_features': 4, 'n_estimators': 50}
49733.52619831834 {'max_features': 4, 'n_estimators': 100}
49460.29499662624 {'max_features': 6, 'n_estimators': 40}
49339.20002501062 {'max_features': 6, 'n_estimators': 50}
49257.10026079575 {'max_features': 6, 'n_estimators': 100}
49905.83478199433 {'max_features': 8, 'n_estimators': 40}
49633.64757286196 {'max_features': 8, 'n_estimators': 50}
49206.19383042872 {'max_features': 8, 'n_estimators': 100}
50056.82235234841 {'max_features': 10, 'n_estimators': 40}
49719.48008406768 {'max_features': 10, 'n_estimators': 50}
49432.86783507407 {'max_features': 10, 'n_estimators': 100}
50351.29073374336 {'max_features': 12, 'n_estimators': 40}
50149.330986091576 {'max_features': 12, 'n_estimators': 50}
49596.574047315684 {'max_features': 12, 'n_estimators': 100}
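
A common follow-up is to inspect the feature importances of the best model. A sketch, assuming the column order produced by the FeatureUnion above (the eight numeric attributes, the three added ratios, then the one-hot categories):

In [ ]:
num_attribs = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
               'total_bedrooms', 'population', 'households', 'median_income']
extra_attribs = ['rooms_per_household', 'bedrooms_per_room', 'population_per_household']
# the fitted encoder inside cat_pipeline holds the category labels
cat_attribs = list(cat_pipeline.named_steps['cat'].categories_[0])
attributes = num_attribs + extra_attribs + cat_attribs
sorted(zip(grid_search.best_estimator_.feature_importances_, attributes), reverse=True)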

4. Evaluate the System on the Test Set

In [67]:
# Evaluate test dataset
from sklearn.metrics import mean_squared_error

final_model = grid_search.best_estimator_

predictions = final_model.predict(test_X)

MSE = mean_squared_error(test_Y, predictions)
RMSE = np.sqrt(MSE)
RMSE
Out[67]:
47177.70938759043
In [68]:
# 95% confidence interval for the generalization error
from scipy import stats

confidence = 0.95
squared_errors = (predictions - test_Y) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))
Out[68]:
array([45182.66477592, 49091.74400922])
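
For reference, the same interval computed manually from the t critical value (this is what stats.t.interval does under the hood):

In [ ]:
# mean squared error +/- t * standard error, then sqrt to get RMSE bounds
m = len(squared_errors)
t_crit = stats.t.ppf((1 + confidence) / 2, df=m - 1)
t_margin = t_crit * squared_errors.std(ddof=1) / np.sqrt(m)
np.sqrt(squared_errors.mean() - t_margin), np.sqrt(squared_errors.mean() + t_margin)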