import pandas as pd
import numpy as np

# Load the raw housing table and discard the 'ocean_proximity' column
# (the original author noted the result as 20640*9).
data = pd.read_csv('housing.csv').drop(columns='ocean_proximity')

# Baseline: histogram of every remaining column in its raw units.
data.hist(bins=50, figsize=(20, 15))

array([[<AxesSubplot:title={'center':'longitude'}>,
        <AxesSubplot:title={'center':'latitude'}>,
        <AxesSubplot:title={'center':'housing_median_age'}>],
       [<AxesSubplot:title={'center':'total_rooms'}>,
        <AxesSubplot:title={'center':'total_bedrooms'}>,
        <AxesSubplot:title={'center':'population'}>],
       [<AxesSubplot:title={'center':'households'}>,
        <AxesSubplot:title={'center':'median_income'}>,
        <AxesSubplot:title={'center':'median_house_value'}>]],
      dtype=object)


from sklearn.preprocessing import MinMaxScaler
# Min-max normalization.
# In a real train/test workflow the scaler should be fitted on the training
# data only; the fitted scaler can then be used to scale the test data.
scaler = MinMaxScaler()
data_min_max = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
)

data_min_max.hist(bins=50, figsize=(20, 15))

array([[<AxesSubplot:title={'center':'longitude'}>,
        <AxesSubplot:title={'center':'latitude'}>,
        <AxesSubplot:title={'center':'housing_median_age'}>],
       [<AxesSubplot:title={'center':'total_rooms'}>,
        <AxesSubplot:title={'center':'total_bedrooms'}>,
        <AxesSubplot:title={'center':'population'}>],
       [<AxesSubplot:title={'center':'households'}>,
        <AxesSubplot:title={'center':'median_income'}>,
        <AxesSubplot:title={'center':'median_house_value'}>]],
      dtype=object)


from sklearn.preprocessing import StandardScaler
# Standardization (zero-mean normalization) of every column.
scaler = StandardScaler()
data_standarization = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
)

data_standarization.hist(bins=50, figsize=(20, 15))

array([[<AxesSubplot:title={'center':'longitude'}>,
        <AxesSubplot:title={'center':'latitude'}>,
        <AxesSubplot:title={'center':'housing_median_age'}>],
       [<AxesSubplot:title={'center':'total_rooms'}>,
        <AxesSubplot:title={'center':'total_bedrooms'}>,
        <AxesSubplot:title={'center':'population'}>],
       [<AxesSubplot:title={'center':'households'}>,
        <AxesSubplot:title={'center':'median_income'}>,
        <AxesSubplot:title={'center':'median_house_value'}>]],
      dtype=object)


from sklearn.preprocessing import MaxAbsScaler
# Max-abs scaling: each column is scaled by its maximum absolute value.
scaler = MaxAbsScaler()
data_abs_max = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
)

data_abs_max.hist(bins=50, figsize=(20, 15))

array([[<AxesSubplot:title={'center':'longitude'}>,
        <AxesSubplot:title={'center':'latitude'}>,
        <AxesSubplot:title={'center':'housing_median_age'}>],
       [<AxesSubplot:title={'center':'total_rooms'}>,
        <AxesSubplot:title={'center':'total_bedrooms'}>,
        <AxesSubplot:title={'center':'population'}>],
       [<AxesSubplot:title={'center':'households'}>,
        <AxesSubplot:title={'center':'median_income'}>,
        <AxesSubplot:title={'center':'median_house_value'}>]],
      dtype=object)


from sklearn.preprocessing import RobustScaler
# Robust scaling with scikit-learn's default RobustScaler settings.
scaler = RobustScaler()
data_robust = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
)

data_robust.hist(bins=50, figsize=(20, 15))

array([[<AxesSubplot:title={'center':'longitude'}>,
        <AxesSubplot:title={'center':'latitude'}>,
        <AxesSubplot:title={'center':'housing_median_age'}>],
       [<AxesSubplot:title={'center':'total_rooms'}>,
        <AxesSubplot:title={'center':'total_bedrooms'}>,
        <AxesSubplot:title={'center':'population'}>],
       [<AxesSubplot:title={'center':'households'}>,
        <AxesSubplot:title={'center':'median_income'}>,
        <AxesSubplot:title={'center':'median_house_value'}>]],
      dtype=object)


from sklearn.preprocessing import QuantileTransformer
# Quantile (rank) transform onto a uniform distribution.
# 'uniform' is already the default output distribution; it is spelled out
# here to contrast with the 'normal' variant.
scaler = QuantileTransformer(output_distribution='uniform')
data_quantile = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
)

data_quantile.hist(bins=50, figsize=(20, 15))

array([[<AxesSubplot:title={'center':'longitude'}>,
        <AxesSubplot:title={'center':'latitude'}>,
        <AxesSubplot:title={'center':'housing_median_age'}>],
       [<AxesSubplot:title={'center':'total_rooms'}>,
        <AxesSubplot:title={'center':'total_bedrooms'}>,
        <AxesSubplot:title={'center':'population'}>],
       [<AxesSubplot:title={'center':'households'}>,
        <AxesSubplot:title={'center':'median_income'}>,
        <AxesSubplot:title={'center':'median_house_value'}>]],
      dtype=object)


# Quantile (rank) transform onto a normal distribution.
# NOTE: this rebinds `data_quantile`, replacing any earlier value.
scaler = QuantileTransformer(output_distribution='normal')
data_quantile = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
)

data_quantile.hist(bins=50, figsize=(20, 15))

array([[<AxesSubplot:title={'center':'longitude'}>,
        <AxesSubplot:title={'center':'latitude'}>,
        <AxesSubplot:title={'center':'housing_median_age'}>],
       [<AxesSubplot:title={'center':'total_rooms'}>,
        <AxesSubplot:title={'center':'total_bedrooms'}>,
        <AxesSubplot:title={'center':'population'}>],
       [<AxesSubplot:title={'center':'households'}>,
        <AxesSubplot:title={'center':'median_income'}>,
        <AxesSubplot:title={'center':'median_house_value'}>]],
      dtype=object)


from sklearn.preprocessing import PowerTransformer
# Power transform of every column.
# 'yeo-johnson' is already the default method; it is spelled out for clarity.
scaler = PowerTransformer(method='yeo-johnson')
data_power = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
)

data_power.hist(bins=50, figsize=(20, 15))

array([[<AxesSubplot:title={'center':'longitude'}>,
        <AxesSubplot:title={'center':'latitude'}>,
        <AxesSubplot:title={'center':'housing_median_age'}>],
       [<AxesSubplot:title={'center':'total_rooms'}>,
        <AxesSubplot:title={'center':'total_bedrooms'}>,
        <AxesSubplot:title={'center':'population'}>],
       [<AxesSubplot:title={'center':'households'}>,
        <AxesSubplot:title={'center':'median_income'}>,
        <AxesSubplot:title={'center':'median_house_value'}>]],
      dtype=object)


# Unit-vector scaling: divide every column by its own Euclidean (L2) norm.
# np.linalg.norm returns NaN as soon as its input contains a NaN, which would
# turn an entire column with missing values (e.g. 'total_bedrooms') into NaN.
# The norm is therefore computed over the non-missing entries only; missing
# entries remain NaN in the result.
data_unit_vector = data.apply(
    lambda col: col / np.linalg.norm(col.dropna(), 2)
)

# DataFrame.apply already returns a DataFrame carrying the original column
# labels, so no re-wrapping is needed before plotting.
data_unit_vector.hist(bins=50, figsize=(20, 15))

array([[<AxesSubplot:title={'center':'longitude'}>,
        <AxesSubplot:title={'center':'latitude'}>,
        <AxesSubplot:title={'center':'housing_median_age'}>],
       [<AxesSubplot:title={'center':'total_rooms'}>,
        <AxesSubplot:title={'center':'total_bedrooms'}>,
        <AxesSubplot:title={'center':'population'}>],
       [<AxesSubplot:title={'center':'households'}>,
        <AxesSubplot:title={'center':'median_income'}>,
        <AxesSubplot:title={'center':'median_house_value'}>]],
      dtype=object)


# Normalizer does not accept NaN values, so 'total_bedrooms' is left out.
data_temp = data.drop(columns=['total_bedrooms'])

from sklearn.preprocessing import Normalizer

# Unit-norm scaling; the default norm is l2.
scaler = Normalizer()
data_unit_norm = pd.DataFrame(
    scaler.fit_transform(data_temp),
    columns=data_temp.columns,
)

data_unit_norm.hist(bins=50, figsize=(20, 15))

array([[<AxesSubplot:title={'center':'longitude'}>,
        <AxesSubplot:title={'center':'latitude'}>,
        <AxesSubplot:title={'center':'housing_median_age'}>],
       [<AxesSubplot:title={'center':'total_rooms'}>,
        <AxesSubplot:title={'center':'population'}>,
        <AxesSubplot:title={'center':'households'}>],
       [<AxesSubplot:title={'center':'median_income'}>,
        <AxesSubplot:title={'center':'median_house_value'}>,
        <AxesSubplot:>]], dtype=object)


# KBinsDiscretizer does not accept NaN values, so 'total_bedrooms' is left out.
data_temp = data.drop(['total_bedrooms'], axis=1)

from sklearn.preprocessing import KBinsDiscretizer

# 'ordinal' encoding yields one bin index per input column, so the output has
# exactly as many columns as data_temp and the labels below line up.
# With 'onehot-dense' every column is expanded into one indicator column per
# bin, and building the DataFrame with data_temp.columns raises a ValueError
# because the array width no longer matches the number of labels.
transformer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
data_bin = pd.DataFrame(
    transformer.fit_transform(data_temp),
    columns=data_temp.columns,
)

data_bin.hist(bins=50, figsize=(20, 15))

array([[<AxesSubplot:title={'center':'longitude'}>,
        <AxesSubplot:title={'center':'latitude'}>,
        <AxesSubplot:title={'center':'housing_median_age'}>],
       [<AxesSubplot:title={'center':'total_rooms'}>,
        <AxesSubplot:title={'center':'population'}>,
        <AxesSubplot:title={'center':'households'}>],
       [<AxesSubplot:title={'center':'median_income'}>,
        <AxesSubplot:title={'center':'median_house_value'}>,
        <AxesSubplot:>]], dtype=object)


# Binarizer does not accept NaN values, so 'total_bedrooms' is left out.
data_temp = data.drop(columns=['total_bedrooms'])

from sklearn.preprocessing import Binarizer

# Feature binarization: the same threshold (20) is applied to every column.
transformer = Binarizer(threshold=20)
data_binary = pd.DataFrame(
    transformer.fit_transform(data_temp),
    columns=data_temp.columns,
)

data_binary.hist(bins=50, figsize=(20, 15))

array([[<AxesSubplot:title={'center':'longitude'}>,
        <AxesSubplot:title={'center':'latitude'}>,
        <AxesSubplot:title={'center':'housing_median_age'}>],
       [<AxesSubplot:title={'center':'total_rooms'}>,
        <AxesSubplot:title={'center':'population'}>,
        <AxesSubplot:title={'center':'households'}>],
       [<AxesSubplot:title={'center':'median_income'}>,
        <AxesSubplot:title={'center':'median_house_value'}>,
        <AxesSubplot:>]], dtype=object)


# NaN values are not allowed downstream, so 'total_bedrooms' is left out.
data_temp = data.drop(columns=['total_bedrooms'])


def transform_f(x):
    """Return *x* multiplied by ten.

    Serves as the callable handed to FunctionTransformer; it only requires
    that ``x`` supports ``*`` with an int.
    """
    scaled = x * 10
    return scaled


from sklearn.preprocessing import FunctionTransformer
# Wrap the custom function as a transformer. validate=True asks scikit-learn
# to check the input before calling transform_f on it.
transformer = FunctionTransformer(transform_f, validate=True)
data_function = pd.DataFrame(
    transformer.fit_transform(data_temp),
    columns=data_temp.columns,
)

data_function.hist(bins=50, figsize=(20, 15))

array([[<AxesSubplot:title={'center':'longitude'}>,
        <AxesSubplot:title={'center':'latitude'}>,
        <AxesSubplot:title={'center':'housing_median_age'}>],
       [<AxesSubplot:title={'center':'total_rooms'}>,
        <AxesSubplot:title={'center':'population'}>,
        <AxesSubplot:title={'center':'households'}>],
       [<AxesSubplot:title={'center':'median_income'}>,
        <AxesSubplot:title={'center':'median_house_value'}>,
        <AxesSubplot:>]], dtype=object)

Scaler and Transformer

Why Scaling?

Scaling Sensitive Models

Scaling Insensitive Models

Load Raw Data

Normalization

Standardization (Zero-mean normalization)

Max Abs Scaler

Robust Scaler

Quantile Transformer Scaler (Rank scaler)

Power Transformer Scaler

Unit Vector Scaler

Unit Norm Scaler

K-Bins Discretization

Feature Binarization

Function Transformers

Reference