Preprocessing

In [5]:
import pandas as pd

# Load the housing dataset; expects housing.csv in the working directory.
data = pd.read_csv('housing.csv')
# Per the output below: 20640 rows, 10 columns, all float64 except
# ocean_proximity (object); total_bedrooms has 20433 non-null values,
# i.e. 207 missing entries to handle.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

Data Cleaning

  Three options for handling the districts with missing total_bedrooms values:

  • Get rid of the corresponding districts
  • Get rid of the whole attribute
  • Set the values to some value (zero, the mean, the median, etc.)
In [ ]:
# Option 3: fill the 207 missing total_bedrooms values with the column
# median (the median is robust to the heavy right tail of this feature).
median = data['total_bedrooms'].median()
# Assign the result back instead of using inplace=True on a column
# selection: inplace fillna on a selection acts on a temporary and is
# deprecated in modern pandas (FutureWarning in 2.x, removed in 3.x).
data['total_bedrooms'] = data['total_bedrooms'].fillna(median)
In [69]:
import numpy as np
from sklearn.impute import SimpleImputer

# Imputation works on numeric columns only, so drop the text column first.
data_num = data.drop('ocean_proximity', axis=1)
# Fill NaNs with each column's mean. fit_transform = fit + transform in
# one call; `imp` stays fitted and can later transform test data with the
# same learned means.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imp.fit_transform(data_num)  # numpy array, one row per district
# Rebuild a labeled DataFrame and re-attach the categorical column.
data_tr = pd.DataFrame(X, columns=data_num.columns)
data_tr['ocean_proximity'] = data['ocean_proximity']
data_tr.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20640 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

Handling Text and Categorical Attributes

In [70]:
# Preview the imputed frame; total_bedrooms is now fully populated.
data_tr.head()
Out[70]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY
In [66]:
# Category frequencies — 5 distinct values; ISLAND is very rare (5 rows).
data_tr['ocean_proximity'].value_counts()
Out[66]:
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64
In [101]:
from sklearn.preprocessing import LabelBinarizer

# One-hot encode the 5 ocean_proximity categories into a 0/1 matrix.
# NOTE(review): LabelBinarizer is designed for *target* labels; for input
# features, OneHotEncoder or pd.get_dummies is the conventional choice,
# though the resulting binary matrix is the same here.
encoder = LabelBinarizer()
data_cat_binary = encoder.fit_transform(data_tr['ocean_proximity'])  # numpy array, shape (n, 5)
data_cat_binary
Out[101]:
array([[0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       ...,
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0]])
In [100]:
# Wrap the binary matrix in a DataFrame (columns named after the learned
# classes, index aligned for the join), attach it to the feature frame,
# and drop the now-redundant text column. Distinct names per stage avoid
# reusing `data_cat` for three different objects.
onehot = pd.DataFrame(data_cat_binary, columns=encoder.classes_, index=data_tr.index)
data_cat = data_tr.join(onehot).drop('ocean_proximity', axis=1)
data_cat.head()
Out[100]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value <1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 0 0 0 1 0
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 0 0 0 1 0
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 0 0 0 1 0
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 0 0 0 1 0
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 0 0 0 1 0

Add More Features

In [105]:
def add_features(data):
    """Return a copy of ``data`` with three ratio features appended.

    Adds ``rooms_per_household``, ``bedrooms_per_room`` and
    ``population_per_household``; the input frame is not modified.
    Requires the columns ``total_rooms``, ``total_bedrooms``,
    ``population`` and ``households``; a zero denominator yields
    inf/NaN rather than raising.
    """
    # .assign returns a new frame, so the caller's data stays untouched
    # (same contract as the explicit .copy() it replaces).
    return data.assign(
        rooms_per_household=data["total_rooms"] / data["households"],
        bedrooms_per_room=data["total_bedrooms"] / data["total_rooms"],
        population_per_household=data["population"] / data["households"],
    )

# Append the engineered ratio columns to the encoded frame (17 columns total).
data_add = add_features(data_cat)
data_add.head()
Out[105]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value <1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN rooms_per_household bedrooms_per_room population_per_household
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 0 0 0 1 0 6.984127 0.146591 2.555556
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 0 0 0 1 0 6.238137 0.155797 2.109842
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 0 0 0 1 0 8.288136 0.129516 2.802260
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 0 0 0 1 0 5.817352 0.184458 2.547945
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 0 0 0 1 0 6.281853 0.172096 2.181467

Feature Scaling

  • Zero-mean normalization: $x_i' = (x_i - \mathrm{mean}(x_i)) / \mathrm{std}(x_i)$
    • So-called standardization
    • Does not bound values to a specific range
    • Less affected by outliers
  • Min-max normalization: $x_i' = (x_i - \min(x_i)) / (\max(x_i) - \min(x_i))$
    • So-called normalization
    • A commonly recommended default
    • Maps values into the range 0 to 1
In [116]:
# Work on a copy so scaling experiments don't mutate data_add.
data_scale = data_add.copy()
In [113]:
# Min-max scaling: maps every feature into [0, 1].
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): the scaler is fit on the whole dataset here; in a real
# pipeline fit it on the training split only, then reuse the fitted
# scaler to transform the test split.
scaler = MinMaxScaler()
data_scale = scaler.fit_transform(data_scale)  # returns a numpy array
data_scale.shape
Out[113]:
(20640, 17)
In [118]:
# Standardization: zero mean, unit variance per feature.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Fit on the unscaled feature frame directly: under a top-to-bottom run
# the previous cell has already overwritten data_scale with the min-max
# result, and standardizing that would silently stack both scalings.
data_scale = scaler.fit_transform(data_add)
data_scale.shape
Out[118]:
(20640, 17)

Transform Non-Gaussian Distribution to Bell-Shaped Distribution

In [124]:
# check feature distribution
# data_scale is a bare numpy array after scaling; rebuild a labeled
# DataFrame so the histograms carry column names.
data_dist = pd.DataFrame(data_scale, columns=data_add.columns, index=data_add.index)
data_dist.hist(bins = 50, figsize = (20, 15))
Out[124]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1a1f33f358>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a21fabfd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1d59a7b8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1d5a6d30>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a1d5b52e8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1d5ce860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1d5f1dd8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1d61c3c8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a1d61c400>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1d66ae80>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1d69d438>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1d6c59b0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a1dd31f28>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1dd644e0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1dd8aa58>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1ddb1fd0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a1dde2588>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1de09b00>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1de3b0b8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1de64630>]],
      dtype=object)
In [138]:
# Transform a tail-heavy distribution toward a bell shape
# (PowerTransformer defaults to the Yeo-Johnson method).
total_bedrooms = pd.DataFrame(data_dist['total_bedrooms'])
from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer()
# fit_transform = fit + transform in one call; `pt` stays fitted for
# transforming future data with the same learned lambda.
bell = pt.fit_transform(total_bedrooms)
In [135]:
import matplotlib.pyplot as plt

# Histogram of the power-transformed column — roughly bell-shaped now,
# per the bin counts in the output below.
plt.hist(bell)
Out[135]:
(array([ 482.,  941., 2483., 4449., 5023., 3977., 2147.,  871.,  226.,
          41.]),
 array([-2.71495584, -2.09664046, -1.47832507, -0.86000969, -0.24169431,
         0.37662107,  0.99493646,  1.61325184,  2.23156722,  2.84988261,
         3.46819799]),
 <a list of 10 Patch objects>)

Polynomial Features

In [144]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 expansion: bias + linear terms + all pairwise products/squares.
# NOTE(review): X here is the imputed 9-column numeric array from the
# SimpleImputer cell, not the encoded/engineered frame — confirm that is
# intended. 9 features -> 1 + 9 + 9*10/2 = 55 output columns.
poly = PolynomialFeatures(2)
features = poly.fit_transform(X) # numpy array
features.shape
Out[144]:
(20640, 55)

Feature Selection

  • Univariate Feature Selection
  • L1 regularization