import pandas as pd
import numpy as np
# Read the housing CSV and drop the 'ocean_proximity' column in one step
# (the original note records the resulting shape as 20640*9).
data = pd.read_csv('housing.csv').drop(columns='ocean_proximity')
# One 50-bin histogram per remaining column.
data.hist(figsize=(20, 15), bins=50)
array([[<AxesSubplot:title={'center':'longitude'}>, <AxesSubplot:title={'center':'latitude'}>, <AxesSubplot:title={'center':'housing_median_age'}>], [<AxesSubplot:title={'center':'total_rooms'}>, <AxesSubplot:title={'center':'total_bedrooms'}>, <AxesSubplot:title={'center':'population'}>], [<AxesSubplot:title={'center':'households'}>, <AxesSubplot:title={'center':'median_income'}>, <AxesSubplot:title={'center':'median_house_value'}>]], dtype=object)
Pros:
* All features will have the exact same scale
* Preserves the relationships among the original data values
* Ends up with smaller standard deviations, which can suppress the effect of outliers
* Responds well if the standard deviation is small and when a distribution is not Gaussian
Cons:
* Does not handle outliers very well
* Normalized data may not meet the needs of some models, e.g., KDE
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling of every column of `data`.
# NOTE: the scaler is fitted on the full frame here; in a real workflow it
# should be fitted on the training data only, and the fitted scaler can then
# be reused to scale the test data.
scaler = MinMaxScaler()
data_min_max = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
)
# One 50-bin histogram per scaled column.
data_min_max.hist(figsize=(20, 15), bins=50)
array([[<AxesSubplot:title={'center':'longitude'}>, <AxesSubplot:title={'center':'latitude'}>, <AxesSubplot:title={'center':'housing_median_age'}>], [<AxesSubplot:title={'center':'total_rooms'}>, <AxesSubplot:title={'center':'total_bedrooms'}>, <AxesSubplot:title={'center':'population'}>], [<AxesSubplot:title={'center':'households'}>, <AxesSubplot:title={'center':'median_income'}>, <AxesSubplot:title={'center':'median_house_value'}>]], dtype=object)
Pros
Cons
from sklearn.preprocessing import StandardScaler

# Standardize every column of `data` with StandardScaler, then rebuild a
# DataFrame that carries the original column labels.
scaler = StandardScaler()
scaler.fit(data)
data_standarization = pd.DataFrame(
    data=scaler.transform(data),
    columns=data.columns,
)
# One 50-bin histogram per standardized column.
data_standarization.hist(figsize=(20, 15), bins=50)
array([[<AxesSubplot:title={'center':'longitude'}>, <AxesSubplot:title={'center':'latitude'}>, <AxesSubplot:title={'center':'housing_median_age'}>], [<AxesSubplot:title={'center':'total_rooms'}>, <AxesSubplot:title={'center':'total_bedrooms'}>, <AxesSubplot:title={'center':'population'}>], [<AxesSubplot:title={'center':'households'}>, <AxesSubplot:title={'center':'median_income'}>, <AxesSubplot:title={'center':'median_house_value'}>]], dtype=object)
Pros
Cons
from sklearn.preprocessing import MaxAbsScaler

# Scale every column of `data` with MaxAbsScaler and keep the column labels.
scaler = MaxAbsScaler()
abs_max_values = scaler.fit_transform(data)
data_abs_max = pd.DataFrame(data=abs_max_values, columns=data.columns)
# One 50-bin histogram per scaled column.
data_abs_max.hist(figsize=(20, 15), bins=50)
array([[<AxesSubplot:title={'center':'longitude'}>, <AxesSubplot:title={'center':'latitude'}>, <AxesSubplot:title={'center':'housing_median_age'}>], [<AxesSubplot:title={'center':'total_rooms'}>, <AxesSubplot:title={'center':'total_bedrooms'}>, <AxesSubplot:title={'center':'population'}>], [<AxesSubplot:title={'center':'households'}>, <AxesSubplot:title={'center':'median_income'}>, <AxesSubplot:title={'center':'median_house_value'}>]], dtype=object)
from sklearn.preprocessing import RobustScaler

# Scale every column of `data` with RobustScaler and keep the column labels.
scaler = RobustScaler()
data_robust = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
)
# One 50-bin histogram per scaled column.
data_robust.hist(figsize=(20, 15), bins=50)
array([[<AxesSubplot:title={'center':'longitude'}>, <AxesSubplot:title={'center':'latitude'}>, <AxesSubplot:title={'center':'housing_median_age'}>], [<AxesSubplot:title={'center':'total_rooms'}>, <AxesSubplot:title={'center':'total_bedrooms'}>, <AxesSubplot:title={'center':'population'}>], [<AxesSubplot:title={'center':'households'}>, <AxesSubplot:title={'center':'median_income'}>, <AxesSubplot:title={'center':'median_house_value'}>]], dtype=object)
Ranks the observations and maps them onto another distribution, such as the uniform or normal distribution, using the cumulative distribution function (CDF)
Pros
from sklearn.preprocessing import QuantileTransformer

# Quantile transform onto a uniform output distribution
# ('uniform' is already the default; it is spelled out for clarity).
scaler = QuantileTransformer(output_distribution='uniform')
uniform_values = scaler.fit_transform(data)
data_quantile = pd.DataFrame(data=uniform_values, columns=data.columns)
# One 50-bin histogram per transformed column.
data_quantile.hist(figsize=(20, 15), bins=50)
array([[<AxesSubplot:title={'center':'longitude'}>, <AxesSubplot:title={'center':'latitude'}>, <AxesSubplot:title={'center':'housing_median_age'}>], [<AxesSubplot:title={'center':'total_rooms'}>, <AxesSubplot:title={'center':'total_bedrooms'}>, <AxesSubplot:title={'center':'population'}>], [<AxesSubplot:title={'center':'households'}>, <AxesSubplot:title={'center':'median_income'}>, <AxesSubplot:title={'center':'median_house_value'}>]], dtype=object)
# Same quantile transform, but mapped onto a normal output distribution.
# This rebinds both `scaler` and `data_quantile`.
scaler = QuantileTransformer(output_distribution='normal')
data_quantile = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
)
# One 50-bin histogram per transformed column.
data_quantile.hist(figsize=(20, 15), bins=50)
array([[<AxesSubplot:title={'center':'longitude'}>, <AxesSubplot:title={'center':'latitude'}>, <AxesSubplot:title={'center':'housing_median_age'}>], [<AxesSubplot:title={'center':'total_rooms'}>, <AxesSubplot:title={'center':'total_bedrooms'}>, <AxesSubplot:title={'center':'population'}>], [<AxesSubplot:title={'center':'households'}>, <AxesSubplot:title={'center':'median_income'}>, <AxesSubplot:title={'center':'median_house_value'}>]], dtype=object)
from sklearn.preprocessing import PowerTransformer

# Power transform of every column of `data`
# ('yeo-johnson' is already the default method; it is spelled out for clarity).
scaler = PowerTransformer(method='yeo-johnson')
power_values = scaler.fit_transform(data)
data_power = pd.DataFrame(data=power_values, columns=data.columns)
# One 50-bin histogram per transformed column.
data_power.hist(figsize=(20, 15), bins=50)
array([[<AxesSubplot:title={'center':'longitude'}>, <AxesSubplot:title={'center':'latitude'}>, <AxesSubplot:title={'center':'housing_median_age'}>], [<AxesSubplot:title={'center':'total_rooms'}>, <AxesSubplot:title={'center':'total_bedrooms'}>, <AxesSubplot:title={'center':'population'}>], [<AxesSubplot:title={'center':'households'}>, <AxesSubplot:title={'center':'median_income'}>, <AxesSubplot:title={'center':'median_house_value'}>]], dtype=object)
# Scale each column (feature) to unit Euclidean (L2) length.
#
# np.linalg.norm returns NaN for a column that contains any NaN, which would
# turn that entire column into NaN after the division. pandas' sum() skips
# NaN, so the L2 norm is computed from the squared sum instead; only the
# entries that were missing to begin with stay NaN.
column_norms = np.sqrt((data ** 2).sum())
# DataFrame / Series aligns the Series index with the column labels, so each
# column is divided by its own norm. The result is already a DataFrame with
# the original column labels.
data_unit_vector = data / column_norms
# One 50-bin histogram per scaled column.
data_unit_vector.hist(bins=50, figsize=(20, 15))
array([[<AxesSubplot:title={'center':'longitude'}>, <AxesSubplot:title={'center':'latitude'}>, <AxesSubplot:title={'center':'housing_median_age'}>], [<AxesSubplot:title={'center':'total_rooms'}>, <AxesSubplot:title={'center':'total_bedrooms'}>, <AxesSubplot:title={'center':'population'}>], [<AxesSubplot:title={'center':'households'}>, <AxesSubplot:title={'center':'median_income'}>, <AxesSubplot:title={'center':'median_house_value'}>]], dtype=object)
from sklearn.preprocessing import Normalizer

# Normalizer does not accept NaN values, so 'total_bedrooms' is dropped first.
data_temp = data.drop(columns=['total_bedrooms'])
scaler = Normalizer()  # the default norm is l2
data_unit_norm = pd.DataFrame(
    scaler.fit_transform(data_temp),
    columns=data_temp.columns,
)
# One 50-bin histogram per normalized column.
data_unit_norm.hist(figsize=(20, 15), bins=50)
array([[<AxesSubplot:title={'center':'longitude'}>, <AxesSubplot:title={'center':'latitude'}>, <AxesSubplot:title={'center':'housing_median_age'}>], [<AxesSubplot:title={'center':'total_rooms'}>, <AxesSubplot:title={'center':'population'}>, <AxesSubplot:title={'center':'households'}>], [<AxesSubplot:title={'center':'median_income'}>, <AxesSubplot:title={'center':'median_house_value'}>, <AxesSubplot:>]], dtype=object)
# KBinsDiscretizer does not accept NaN values, so 'total_bedrooms' is dropped.
data_temp = data.drop(['total_bedrooms'], axis=1)
from sklearn.preprocessing import KBinsDiscretizer
# Use encode='ordinal' so every feature becomes ONE column of bin indices.
# With 'onehot-dense' the output has one column per (feature, bin) pair, which
# cannot be labelled with the original column names below and makes the
# DataFrame constructor raise ValueError.
transformer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
data_bin = transformer.fit_transform(data_temp)
data_bin = pd.DataFrame(data_bin, columns=data_temp.columns)
# One histogram per discretized column.
data_bin.hist(bins=50, figsize=(20, 15))
array([[<AxesSubplot:title={'center':'longitude'}>, <AxesSubplot:title={'center':'latitude'}>, <AxesSubplot:title={'center':'housing_median_age'}>], [<AxesSubplot:title={'center':'total_rooms'}>, <AxesSubplot:title={'center':'population'}>, <AxesSubplot:title={'center':'households'}>], [<AxesSubplot:title={'center':'median_income'}>, <AxesSubplot:title={'center':'median_house_value'}>, <AxesSubplot:>]], dtype=object)
from sklearn.preprocessing import Binarizer

# Binarizer does not accept NaN values, so 'total_bedrooms' is dropped first.
data_temp = data.drop(columns=['total_bedrooms'])
# The same threshold (20) is applied to every column.
transformer = Binarizer(threshold=20)
data_binary = pd.DataFrame(
    transformer.fit_transform(data_temp),
    columns=data_temp.columns,
)
# One 50-bin histogram per binarized column.
data_binary.hist(figsize=(20, 15), bins=50)
array([[<AxesSubplot:title={'center':'longitude'}>, <AxesSubplot:title={'center':'latitude'}>, <AxesSubplot:title={'center':'housing_median_age'}>], [<AxesSubplot:title={'center':'total_rooms'}>, <AxesSubplot:title={'center':'population'}>, <AxesSubplot:title={'center':'households'}>], [<AxesSubplot:title={'center':'median_income'}>, <AxesSubplot:title={'center':'median_house_value'}>, <AxesSubplot:>]], dtype=object)
# 'total_bedrooms' is dropped because NaN values are not allowed downstream.
data_temp = data.drop(columns=['total_bedrooms'])
def transform_f(x):
    """Return ``x`` multiplied by 10.

    Passed as the callable to ``FunctionTransformer``; it works on any value
    that supports ``* 10`` (plain numbers, NumPy arrays, ...).
    """
    return x * 10
from sklearn.preprocessing import FunctionTransformer

# Wrap transform_f in a FunctionTransformer and apply it to `data_temp`;
# validate=True asks the transformer to check the input before calling it.
transformer = FunctionTransformer(func=transform_f, validate=True)
function_values = transformer.fit_transform(data_temp)
data_function = pd.DataFrame(data=function_values, columns=data_temp.columns)
# One 50-bin histogram per transformed column.
data_function.hist(figsize=(20, 15), bins=50)
array([[<AxesSubplot:title={'center':'longitude'}>, <AxesSubplot:title={'center':'latitude'}>, <AxesSubplot:title={'center':'housing_median_age'}>], [<AxesSubplot:title={'center':'total_rooms'}>, <AxesSubplot:title={'center':'population'}>, <AxesSubplot:title={'center':'households'}>], [<AxesSubplot:title={'center':'median_income'}>, <AxesSubplot:title={'center':'median_house_value'}>, <AxesSubplot:>]], dtype=object)