import pandas as pd
# Load the raw housing table and print its column / dtype / non-null summary.
data = pd.read_csv('housing.csv')
data.info()

# Replace missing 'total_bedrooms' entries with the column median.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the column selection: under pandas Copy-on-Write that selection is a
# temporary object, so an in-place fill on it never reaches `data`.
median = data['total_bedrooms'].median()
data['total_bedrooms'] = data['total_bedrooms'].fillna(median)
import numpy as np
from sklearn.impute import SimpleImputer
# Set the text column aside; it is re-attached after imputation.
data_num = data.drop('ocean_proximity', axis=1)

# Fill any NaN left in the numeric columns with the per-column mean.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imp.fit_transform(data_num)  # NumPy array: labels and index are not carried over

# Rebuild a labelled frame. The original index is passed explicitly so the
# index-aligned assignment of 'ocean_proximity' below matches row for row
# even when `data` does not use a default 0..n-1 index.
data_tr = pd.DataFrame(X, columns=data_num.columns, index=data_num.index)
data_tr['ocean_proximity'] = data['ocean_proximity']
data_tr.info()

# Bare expressions: their values are only shown in an interactive session.
data_tr.head()
data_tr['ocean_proximity'].value_counts()
from sklearn.preprocessing import LabelBinarizer
# Turn 'ocean_proximity' into 0/1 indicator columns and use them in place of
# the original text column.
# NOTE(review): LabelBinarizer returns a single column when there are exactly
# two classes, which would not match encoder.classes_ below - confirm the
# column always has three or more categories.
encoder = LabelBinarizer()
data_cat_binary = encoder.fit_transform(data_tr['ocean_proximity'])  # NumPy array
data_cat_binary

# Label the indicators with the class names, align them on data_tr's index,
# append them, then discard the text column.
data_cat = data_tr.join(
    pd.DataFrame(data_cat_binary, index=data_tr.index, columns=encoder.classes_)
).drop(columns='ocean_proximity')
data_cat.head()
def add_features(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *data* with three ratio columns appended.

    The input frame is left untouched. The added columns are:

    * ``rooms_per_household``      = ``total_rooms`` / ``households``
    * ``bedrooms_per_room``        = ``total_bedrooms`` / ``total_rooms``
    * ``population_per_household`` = ``population`` / ``households``

    Args:
        data: Frame containing the columns ``total_rooms``,
            ``total_bedrooms``, ``population`` and ``households``.

    Returns:
        A new DataFrame with the original columns followed by the three
        ratio columns, in the order listed above.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Work on a copy so the caller's frame is never mutated.
    data_copy = data.copy()

    households = data_copy["households"]
    total_rooms = data_copy["total_rooms"]

    data_copy["rooms_per_household"] = total_rooms / households
    data_copy["bedrooms_per_room"] = data_copy["total_bedrooms"] / total_rooms
    data_copy["population_per_household"] = data_copy["population"] / households
    return data_copy
from sklearn.preprocessing import MinMaxScaler

# Extend the encoded table with the ratio features.
data_add = add_features(data_cat)
data_add.head()

# Min-max scaling.
# In a train/test workflow the scaler should be fitted on the training split
# only and then reused to scale the test split; this script makes no split,
# so the scaler is fitted on every row of data_add.
data_scale = data_add.copy()
scaler = MinMaxScaler()
data_scale = scaler.fit_transform(data_scale)  # fit, then transform the same data
data_scale.shape
from sklearn.preprocessing import StandardScaler

# Standardization.
# `scaler` is rebound to a StandardScaler here, and the scaler is fitted on
# whatever `data_scale` currently holds (the already-scaled array, not the
# raw feature table).
scaler = StandardScaler()
data_scale = scaler.fit_transform(data_scale)
data_scale.shape

# Check the feature distributions: put the column labels and row index of
# data_add back on the scaled array and draw one histogram per column.
data_dist = pd.DataFrame(data_scale, index=data_add.index, columns=data_add.columns)
data_dist.hist(bins=50, figsize=(20, 15))
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer

# Transform the tail-heavy 'total_bedrooms' distribution into a more
# bell-shaped one.
total_bedrooms = data_dist['total_bedrooms'].to_frame()  # single-column 2-D frame

pt = PowerTransformer()  # default settings
bell = pt.fit(total_bedrooms).transform(total_bedrooms)

# Histogram of the transformed values.
plt.hist(bell)
from sklearn.preprocessing import PolynomialFeatures
# Degree-2 polynomial expansion.
# NOTE(review): the input is X, the imputed numeric array built earlier, not
# the scaled / feature-augmented data - confirm this is intended.
poly = PolynomialFeatures(degree=2)
features = poly.fit_transform(X)  # NumPy array
features.shape