Preprocessing

In [5]:
import pandas as pd

# Load the housing dataset; expects housing.csv in the working directory.
data = pd.read_csv('housing.csv')
# Per the output below: 20640 rows, 10 columns, all float64 except
# ocean_proximity (object); total_bedrooms has 20433 non-null values,
# i.e. 207 missing entries to handle.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20433 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

Data Cleaning

  Three options for handling the districts with missing total_bedrooms values:

  • Get rid of the corresponding districts
  • Get rid of the whole attribute
  • Set the values to some value (zero, the mean, the median, etc.)
In [ ]:
# Option 3: fill the 207 missing total_bedrooms values with the column
# median (the median is robust to the heavy right tail of this feature).
median = data['total_bedrooms'].median()
# Assign the result back instead of using inplace=True on a column
# selection: inplace fillna on a selection acts on a temporary and is
# deprecated in modern pandas (FutureWarning in 2.x, removed in 3.x).
data['total_bedrooms'] = data['total_bedrooms'].fillna(median)
In [69]:
import numpy as np
from sklearn.impute import SimpleImputer

# Imputation works on numeric columns only, so drop the text column first.
data_num = data.drop('ocean_proximity', axis=1)
# Fill NaNs with each column's mean. fit_transform = fit + transform in
# one call; `imp` stays fitted and can later transform test data with the
# same learned means.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imp.fit_transform(data_num)  # numpy array, one row per district
# Rebuild a labeled DataFrame and re-attach the categorical column.
data_tr = pd.DataFrame(X, columns=data_num.columns)
data_tr['ocean_proximity'] = data['ocean_proximity']
data_tr.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
longitude             20640 non-null float64
latitude              20640 non-null float64
housing_median_age    20640 non-null float64
total_rooms           20640 non-null float64
total_bedrooms        20640 non-null float64
population            20640 non-null float64
households            20640 non-null float64
median_income         20640 non-null float64
median_house_value    20640 non-null float64
ocean_proximity       20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

Handling Text and Categorical Attributes

In [70]:
# Preview the imputed frame; total_bedrooms is now fully populated.
data_tr.head()
Out[70]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value ocean_proximity
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 NEAR BAY
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 NEAR BAY
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 NEAR BAY
In [66]:
# Category frequencies — 5 distinct values; ISLAND is very rare (5 rows).
data_tr['ocean_proximity'].value_counts()
Out[66]:
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64
In [101]:
from sklearn.preprocessing import LabelBinarizer

# One-hot encode the 5 ocean_proximity categories into a 0/1 matrix.
# NOTE(review): LabelBinarizer is designed for *target* labels; for input
# features, OneHotEncoder or pd.get_dummies is the conventional choice,
# though the resulting binary matrix is the same here.
encoder = LabelBinarizer()
data_cat_binary = encoder.fit_transform(data_tr['ocean_proximity'])  # numpy array, shape (n, 5)
data_cat_binary
Out[101]:
array([[0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       ...,
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0]])
In [100]:
# Wrap the binary matrix in a DataFrame (columns named after the learned
# classes, index aligned for the join), attach it to the feature frame,
# and drop the now-redundant text column. Distinct names per stage avoid
# reusing `data_cat` for three different objects.
onehot = pd.DataFrame(data_cat_binary, columns=encoder.classes_, index=data_tr.index)
data_cat = data_tr.join(onehot).drop('ocean_proximity', axis=1)
data_cat.head()
Out[100]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value <1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 0 0 0 1 0
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 0 0 0 1 0
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 0 0 0 1 0
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 0 0 0 1 0
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 0 0 0 1 0

Add More Features

In [105]:
def add_features(data):
    """Return a copy of ``data`` with three ratio features appended.

    Adds ``rooms_per_household``, ``bedrooms_per_room`` and
    ``population_per_household``; the input frame is not modified.
    Requires the columns ``total_rooms``, ``total_bedrooms``,
    ``population`` and ``households``; a zero denominator yields
    inf/NaN rather than raising.
    """
    # .assign returns a new frame, so the caller's data stays untouched
    # (same contract as the explicit .copy() it replaces).
    return data.assign(
        rooms_per_household=data["total_rooms"] / data["households"],
        bedrooms_per_room=data["total_bedrooms"] / data["total_rooms"],
        population_per_household=data["population"] / data["households"],
    )

# Append the engineered ratio columns to the encoded frame (17 columns total).
data_add = add_features(data_cat)
data_add.head()
Out[105]:
longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_value <1H OCEAN INLAND ISLAND NEAR BAY NEAR OCEAN rooms_per_household bedrooms_per_room population_per_household
0 -122.23 37.88 41.0 880.0 129.0 322.0 126.0 8.3252 452600.0 0 0 0 1 0 6.984127 0.146591 2.555556
1 -122.22 37.86 21.0 7099.0 1106.0 2401.0 1138.0 8.3014 358500.0 0 0 0 1 0 6.238137 0.155797 2.109842
2 -122.24 37.85 52.0 1467.0 190.0 496.0 177.0 7.2574 352100.0 0 0 0 1 0 8.288136 0.129516 2.802260
3 -122.25 37.85 52.0 1274.0 235.0 558.0 219.0 5.6431 341300.0 0 0 0 1 0 5.817352 0.184458 2.547945
4 -122.25 37.85 52.0 1627.0 280.0 565.0 259.0 3.8462 342200.0 0 0 0 1 0 6.281853 0.172096 2.181467

Feature Scaling

  • Zero-mean normalization: $x_i' = (x_i - \mathrm{mean}(x_i)) / \mathrm{std}(x_i)$
    • So-called standardization
    • Does not bound values to a specific range
    • Less affected by outliers
  • Min-max normalization: $x_i' = (x_i - \min(x_i)) / (\max(x_i) - \min(x_i))$
    • So-called normalization
    • A commonly recommended default
    • Maps values into the range 0 to 1
In [116]:
# Work on a copy so scaling experiments don't mutate data_add.
data_scale = data_add.copy()
In [113]:
# Min-max scaling: maps every feature into [0, 1].
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): the scaler is fit on the whole dataset here; in a real
# pipeline fit it on the training split only, then reuse the fitted
# scaler to transform the test split.
scaler = MinMaxScaler()
data_scale = scaler.fit_transform(data_scale)  # returns a numpy array
data_scale.shape
Out[113]:
(20640, 17)
In [118]:
# Standardization: zero mean, unit variance per feature.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Fit on the unscaled feature frame directly: under a top-to-bottom run
# the previous cell has already overwritten data_scale with the min-max
# result, and standardizing that would silently stack both scalings.
data_scale = scaler.fit_transform(data_add)
data_scale.shape
Out[118]:
(20640, 17)

Transform Non-Gaussian Distribution to Bell-Shaped Distribution

In [124]:
# check feature distribution
# data_scale is a bare numpy array after scaling; rebuild a labeled
# DataFrame so the histograms carry column names.
data_dist = pd.DataFrame(data_scale, columns=data_add.columns, index=data_add.index)
data_dist.hist(bins = 50, figsize = (20, 15))
Out[124]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1a1f33f358>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a21fabfd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1d59a7b8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1d5a6d30>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a1d5b52e8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1d5ce860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1d5f1dd8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1d61c3c8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a1d61c400>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1d66ae80>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1d69d438>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1d6c59b0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a1dd31f28>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1dd644e0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1dd8aa58>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1ddb1fd0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a1dde2588>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1de09b00>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1de3b0b8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a1de64630>]],
      dtype=object)
In [138]:
# Transform a tail-heavy distribution toward a bell shape
# (PowerTransformer defaults to the Yeo-Johnson method).
total_bedrooms = pd.DataFrame(data_dist['total_bedrooms'])
from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer()
# fit_transform = fit + transform in one call; `pt` stays fitted for
# transforming future data with the same learned lambda.
bell = pt.fit_transform(total_bedrooms)
In [135]:
import matplotlib.pyplot as plt

# Histogram of the power-transformed column — roughly bell-shaped now,
# per the bin counts in the output below.
plt.hist(bell)
Out[135]:
(array([ 482.,  941., 2483., 4449., 5023., 3977., 2147.,  871.,  226.,
          41.]),
 array([-2.71495584, -2.09664046, -1.47832507, -0.86000969, -0.24169431,
         0.37662107,  0.99493646,  1.61325184,  2.23156722,  2.84988261,
         3.46819799]),
 <a list of 10 Patch objects>)

Polynomial Features

In [144]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 expansion: bias + linear terms + all pairwise products/squares.
# NOTE(review): X here is the imputed 9-column numeric array from the
# SimpleImputer cell, not the encoded/engineered frame — confirm that is
# intended. 9 features -> 1 + 9 + 9*10/2 = 55 output columns.
poly = PolynomialFeatures(2)
features = poly.fit_transform(X) # numpy array
features.shape
Out[144]:
(20640, 55)

Feature Selection

  • Univariate Feature Selection
  • L1 regularization