Feature Selection

Get the Data

In [1]:
import numpy as np
import pandas as pd

data = pd.read_csv('data_3.csv')
In [2]:
data = data.drop(['Cryo', 'PDB', 'Cryo Type', 'neighbor trees'], axis = 1)
columns = {'Map Resolution': 'Resolution', 'vTree#':'Tree_N', 'numOfVoxels': 'Voxel_N', 'numberOfNeighbors':'N_Neighbors', 'Max Density':'Max_density', 'MinDensity':'Min_density', 'AvgDensity':'Avg_density', 'stdDensity':'Std_density', 'PCA_EigenValue1':'PCA_1', 'PCA_EigenValue2':'PCA_2', 'PCA_EigenValue3':'PCA_3', 'PCA Thickness Ratio ev2/ev3':'PCA_thick', 'Structure Tensor Helix (Percentage)':'Tensor_helix', 'Structure Tensor Sheet (Percentage)':'Tensor_sheet', 'Percentage of voxels with density less than average':'Per_voxel', 'Width':'Width', 'Hlx overlap (percentage)':'Hlx_Per', 'Strand overlap (percentage)':'Sheet_Per', 'Loop overlap (percentage)':'Loop_Per'}
data = data.rename(columns=columns)
In [3]:
temp = data[['Hlx_Per', 'Sheet_Per', 'Loop_Per']]
max_Per = np.max(temp, axis=1)
#temp.shape, max_Per.shape
data['Max_Per'] = max_Per
data.head()
threshold = 0.9 # select data whose maximum overlap percentage is greater than a threshold
data = data[data['Max_Per'] > threshold]
data = data.drop(['Hlx_Per', 'Sheet_Per', 'Loop_Per', 'Max_Per'], axis = 1)
In [4]:
data_Helix = data[data['Label'] == 'Helix']
data_Sheet = data[data['Label'] == 'Sheet']
data_Loop = data[data['Label'] == 'Loop']

data_Helix = data_Helix.sample(n = 350, random_state = 42)
data_Sheet = data_Sheet
data_Loop = data_Loop.sample(n = 350, random_state = 42)
print(data_Helix.shape, data_Sheet.shape, data_Loop.shape)
(350, 17) (309, 17) (350, 17)
In [5]:
data = pd.concat([data_Helix, data_Sheet, data_Loop])
In [6]:
data.head()
Out[6]:
Resolution Tree_N Voxel_N N_Neighbors Max_density Min_density Avg_density Std_density PCA_1 PCA_2 PCA_3 PCA_thick Tensor_helix Tensor_sheet Per_voxel Width Label
3107 5.9 6 190 8 0.591599 0.454467 0.512721 0.041351 10.115200 1.821830 1.311840 1.39 0.868 0.132 0.547368 2.80592 Helix
3528 5.0 33 115 2 0.596935 0.452163 0.516925 0.031808 4.287560 1.892360 1.003600 1.89 0.400 0.600 0.504348 3.47828 Helix
1520 5.0 18 253 7 0.715440 0.497692 0.594844 0.048938 9.418030 3.690890 1.078860 3.42 0.419 0.581 0.561265 6.01528 Helix
750 5.9 158 11 1 0.484277 0.420684 0.454697 0.023213 0.788791 0.437241 0.173968 2.51 0.455 0.545 0.454545 2.29103 Helix
768 5.9 78 67 3 0.524908 0.431320 0.467359 0.025337 2.341210 1.213610 0.941341 1.29 0.776 0.224 0.597015 4.58205 Helix

Discover and visualize the data to gain insights

In [7]:
data.hist(bins = 50, figsize = (20, 15))
Out[7]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x12346ef50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x12346e850>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x123e5cc50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x123e9cfd0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x123ed1c90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x123f0f4d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x123f45cd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x123f82510>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x123f8f090>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x123fc4a10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x12402dd50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x12406d590>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1240a1d90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1240e15d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x124113dd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x124156610>]],
      dtype=object)

Prepare Training Set and Test Set

In [8]:
data_X = data.iloc[:, 0:-1]
data_Y = data.iloc[:, -1]
In [9]:
import warnings
warnings.filterwarnings('ignore')
In [10]:
data_Y[data_Y == 'Helix'] =0
data_Y[data_Y == 'Sheet'] =1
data_Y[data_Y == 'Loop'] = 2
In [11]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_Y, test_Y = train_test_split(data_X, data_Y, test_size=0.2, random_state=42, stratify = data_Y)
train_Y = train_Y.astype(np.int64)
test_Y = test_Y.astype(np.int64)
In [12]:
# Select specific columns
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]
    
# Create Pipeline for Numeric Columns
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer

left_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(['Tree_N', 'Voxel_N', 'Max_density',
       'Min_density', 'Avg_density', 'Std_density', 'PCA_1', 'PCA_2', 'PCA_3',
       'PCA_thick', 'Tensor_helix', 'Tensor_sheet', 'Per_voxel', 'Width'])),
    ("transform", QuantileTransformer(n_quantiles=200, output_distribution='normal', random_state=42))
])

right_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(['Resolution', 'N_Neighbors'])),
])

from sklearn.pipeline import FeatureUnion

preprocess_pipeline = FeatureUnion(transformer_list=[
        ("left_pipeline", left_pipeline),
        ("right_pipeline", right_pipeline),
    ])

train_X = pd.DataFrame(preprocess_pipeline.fit_transform(train_X), columns=['Tree_N', 'Voxel_N', 'Max_density', 'Min_density', 'Avg_density', 'Std_density', 'PCA_1', 'PCA_2', 'PCA_3', 'PCA_thick', 'Tensor_helix','Tensor_sheet', 'Per_voxel', 'Width', 'Resolution', 'N_Neighbors'])
test_X = pd.DataFrame(preprocess_pipeline.fit_transform(test_X), columns=['Tree_N', 'Voxel_N', 'Max_density', 'Min_density', 'Avg_density', 'Std_density', 'PCA_1', 'PCA_2', 'PCA_3', 'PCA_thick', 'Tensor_helix','Tensor_sheet', 'Per_voxel', 'Width', 'Resolution', 'N_Neighbors'])
In [13]:
train_X.head()
train_X.hist(bins = 50, figsize = (20, 15))
Out[13]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1a27ef53d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a2818ef50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a281f76d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a28229ed0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a2826b710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a2829ff10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a282e1750>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a28317f50>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a28320ad0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a28362490>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a283cd7d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a283fefd0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1a28441810>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a28474f90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a284b7850>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1a284f4bd0>]],
      dtype=object)

Select Features

1. Removing features with low variance

  • removes all features whose variance is less than the threshold
In [14]:
from sklearn.feature_selection import VarianceThreshold
X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
sel = VarianceThreshold(threshold=0.16)
X_new = sel.fit_transform(X)
In [15]:
sel.inverse_transform(X_new) # inverse the features, the columns removes are all 0s
Out[15]:
array([[0, 0, 1],
       [0, 1, 0],
       [0, 0, 0],
       [0, 1, 1],
       [0, 1, 0],
       [0, 1, 1]])

2. Univariate feature selection

In [16]:
import numpy as np
from sklearn.model_selection import cross_val_score

def cross_validation(model, X, Y, k = 10, metric = 'accuracy'):
    scores = cross_val_score(model, X, Y, scoring=metric, cv = k);
    return scores.mean(), scores.std()

2.1 selectKBest

  • For regression: f_regression, mutual_info_regression
  • For classification: chi2, f_classif, mutual_info_classif
In [17]:
# select best k features, k = 1, 2, ...
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier

X_new = []
for i in range(1, 17):
    X_new.append(SelectKBest(f_classif, k=i).fit_transform(train_X, train_Y))
In [18]:
# use the cross-validation to decide the best k value
for i in range(1, 17):
    mean, std = cross_validation(RandomForestClassifier(), X_new[i-1], train_Y, 10)
    print(i, mean, std)
1 0.3928858024691358 0.04996962430845789
2 0.40774691358024695 0.041735984863855956
3 0.44733024691358025 0.05040275838193024
4 0.49550925925925926 0.06819941717751807
5 0.5141512345679012 0.060348072155700386
6 0.5574845679012347 0.06086120220573822
7 0.5562654320987654 0.05399731713319364
8 0.574891975308642 0.04032582218055098
9 0.5861265432098766 0.0489684811059409
10 0.5922530864197532 0.04571918070378082
11 0.5835493827160494 0.041302316452074106
12 0.5822839506172839 0.057023006130564866
13 0.587391975308642 0.047905124750555174
14 0.5922685185185186 0.05179737226526403
15 0.5898302469135801 0.042483951285476065
16 0.593503086419753 0.04605836234975256
In [19]:
# Use the inverse transform to check which features are selected, the removed feature have all 0s
sel = SelectKBest(f_classif, k=14)
X_new = sel.fit_transform(train_X, train_Y)
X_inverse = pd.DataFrame(sel.inverse_transform(X_new), columns = train_X.columns)
X_inverse.head()
Out[19]:
Tree_N Voxel_N Max_density Min_density Avg_density Std_density PCA_1 PCA_2 PCA_3 PCA_thick Tensor_helix Tensor_sheet Per_voxel Width Resolution N_Neighbors
0 1.166755 -1.005535 0.207948 0.926425 0.494709 -0.480344 -0.917843 0.0 -0.965648 -0.911923 1.019514 -1.019514 -0.702428 0.0 5.0 1.0
1 -1.237018 0.017991 -1.106579 -0.703618 -0.814769 -1.307636 0.592708 0.0 -0.705799 1.382184 -0.202922 0.196497 -1.197786 0.0 5.6 2.0
2 1.029744 -0.484345 -0.500216 -0.500815 -0.643438 -0.462057 -0.930939 0.0 -0.133523 -0.421532 -0.506159 0.506159 0.549132 0.0 5.0 2.0
3 -0.360340 -1.044006 -0.996178 -0.394536 -0.903668 -1.511897 -0.983446 0.0 -0.891406 -0.313674 -1.657878 1.657878 -0.248148 0.0 5.6 3.0
4 0.414658 -0.326933 -0.018705 0.219565 0.043512 -0.299451 -0.420808 0.0 -0.056908 -0.380576 0.132647 -0.132647 -1.293090 0.0 6.1 3.0

2.2 SelectPercentile

  • Select features according to a percentile of the important features with scores
In [46]:
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.ensemble import RandomForestClassifier

percentages = range(5, 105, 5)
X_new = []
for i in percentages:
    X_new.append(SelectPercentile(f_classif, percentile=i).fit_transform(train_X, train_Y))
In [48]:
# use the cross-validation to decide the best k value
for index, value in enumerate(percentages):
    mean, std = cross_validation(RandomForestClassifier(random_state = 42), X_new[index], train_Y, 10)
    print(value, mean, std)
5 0.39663580246913577 0.06143513672969395
10 0.40035493827160495 0.061461151830623055
15 0.4497993827160493 0.0570235594980671
20 0.4497993827160493 0.0570235594980671
25 0.50429012345679 0.05429260622204982
30 0.5203395061728395 0.04834750198331053
35 0.5562962962962963 0.044284574006013315
40 0.5562962962962963 0.044284574006013315
45 0.5761728395061729 0.041615549348398215
50 0.5872530864197532 0.04662339181712164
55 0.5736265432098765 0.04764268802313536
60 0.5736265432098765 0.04764268802313536
65 0.5835956790123457 0.040524764816158625
70 0.5786265432098765 0.04414024032337746
75 0.5947222222222223 0.0466612569534136
80 0.5947222222222223 0.0466612569534136
85 0.596033950617284 0.03632357107517241
90 0.605817901234568 0.050622837790010665
95 0.6108950617283951 0.04981041637687091
100 0.597145061728395 0.0481431151413458
In [49]:
# Use the inverse transform to check which features are selected, the removed feature have all 0s
sel = SelectPercentile(f_classif, percentile=95)
X_new = sel.fit_transform(train_X, train_Y)
X_inverse = pd.DataFrame(sel.inverse_transform(X_new), columns = train_X.columns)
X_inverse.head()
Out[49]:
Tree_N Voxel_N Max_density Min_density Avg_density Std_density PCA_1 PCA_2 PCA_3 PCA_thick Tensor_helix Tensor_sheet Per_voxel Width Resolution N_Neighbors
0 1.166755 -1.005535 0.207948 0.926425 0.494709 -0.480344 -0.917843 0.0 -0.965648 -0.911923 1.019514 -1.019514 -0.702428 -1.573609 5.0 1.0
1 -1.237018 0.017991 -1.106579 -0.703618 -0.814769 -1.307636 0.592708 0.0 -0.705799 1.382184 -0.202922 0.196497 -1.197786 -0.094612 5.6 2.0
2 1.029744 -0.484345 -0.500216 -0.500815 -0.643438 -0.462057 -0.930939 0.0 -0.133523 -0.421532 -0.506159 0.506159 0.549132 -0.340250 5.0 2.0
3 -0.360340 -1.044006 -0.996178 -0.394536 -0.903668 -1.511897 -0.983446 0.0 -0.891406 -0.313674 -1.657878 1.657878 -0.248148 -0.435339 5.6 3.0
4 0.414658 -0.326933 -0.018705 0.219565 0.043512 -0.299451 -0.420808 0.0 -0.056908 -0.380576 0.132647 -0.132647 -1.293090 0.188674 6.1 3.0

2.3 SelectFpr

  • Select the pvalues below alpha based on a false positive rate (FPR) test
In [51]:
from sklearn.feature_selection import SelectFpr, chi2
from sklearn.ensemble import RandomForestClassifier

alphas = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]
X_new = []
for i in alphas:
    X_new.append(SelectFpr(f_classif, alpha=i).fit_transform(train_X, train_Y))
In [52]:
# use the cross-validation to decide the best k value
for index, value in enumerate(alphas):
    mean, std = cross_validation(RandomForestClassifier(random_state = 42), X_new[index], train_Y, 10)
    print(value, mean, std)
0.01 0.6108950617283951 0.04981041637687091
0.02 0.597145061728395 0.0481431151413458
0.03 0.597145061728395 0.0481431151413458
0.04 0.597145061728395 0.0481431151413458
0.05 0.597145061728395 0.0481431151413458
0.06 0.597145061728395 0.0481431151413458
0.07 0.597145061728395 0.0481431151413458
0.08 0.597145061728395 0.0481431151413458
0.09 0.597145061728395 0.0481431151413458
0.1 0.597145061728395 0.0481431151413458
In [53]:
# Use the inverse transform to check which features are selected, the removed feature have all 0s
sel = SelectFpr(f_classif, alpha=0.01)
X_new = sel.fit_transform(train_X, train_Y)
X_inverse = pd.DataFrame(sel.inverse_transform(X_new), columns = train_X.columns)
X_inverse.head()
Out[53]:
Tree_N Voxel_N Max_density Min_density Avg_density Std_density PCA_1 PCA_2 PCA_3 PCA_thick Tensor_helix Tensor_sheet Per_voxel Width Resolution N_Neighbors
0 1.166755 -1.005535 0.207948 0.926425 0.494709 -0.480344 -0.917843 0.0 -0.965648 -0.911923 1.019514 -1.019514 -0.702428 -1.573609 5.0 1.0
1 -1.237018 0.017991 -1.106579 -0.703618 -0.814769 -1.307636 0.592708 0.0 -0.705799 1.382184 -0.202922 0.196497 -1.197786 -0.094612 5.6 2.0
2 1.029744 -0.484345 -0.500216 -0.500815 -0.643438 -0.462057 -0.930939 0.0 -0.133523 -0.421532 -0.506159 0.506159 0.549132 -0.340250 5.0 2.0
3 -0.360340 -1.044006 -0.996178 -0.394536 -0.903668 -1.511897 -0.983446 0.0 -0.891406 -0.313674 -1.657878 1.657878 -0.248148 -0.435339 5.6 3.0
4 0.414658 -0.326933 -0.018705 0.219565 0.043512 -0.299451 -0.420808 0.0 -0.056908 -0.380576 0.132647 -0.132647 -1.293090 0.188674 6.1 3.0

2.4 SelectFdr

  • Select the p-values for an estimated false discovery rate
In [54]:
from sklearn.feature_selection import SelectFdr, chi2
from sklearn.ensemble import RandomForestClassifier

alphas = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]
X_new = []
for i in alphas:
    X_new.append(SelectFdr(f_classif, alpha=i).fit_transform(train_X, train_Y))
In [55]:
# use the cross-validation to decide the best k value
for index, value in enumerate(alphas):
    mean, std = cross_validation(RandomForestClassifier(random_state = 42), X_new[index], train_Y, 10)
    print(value, mean, std)
0.01 0.6108950617283951 0.04981041637687091
0.02 0.597145061728395 0.0481431151413458
0.03 0.597145061728395 0.0481431151413458
0.04 0.597145061728395 0.0481431151413458
0.05 0.597145061728395 0.0481431151413458
0.06 0.597145061728395 0.0481431151413458
0.07 0.597145061728395 0.0481431151413458
0.08 0.597145061728395 0.0481431151413458
0.09 0.597145061728395 0.0481431151413458
0.1 0.597145061728395 0.0481431151413458
In [56]:
# Use the inverse transform to check which features are selected, the removed feature have all 0s
sel = SelectFdr(f_classif, alpha=0.01)
X_new = sel.fit_transform(train_X, train_Y)
X_inverse = pd.DataFrame(sel.inverse_transform(X_new), columns = train_X.columns)
X_inverse.head()
Out[56]:
Tree_N Voxel_N Max_density Min_density Avg_density Std_density PCA_1 PCA_2 PCA_3 PCA_thick Tensor_helix Tensor_sheet Per_voxel Width Resolution N_Neighbors
0 1.166755 -1.005535 0.207948 0.926425 0.494709 -0.480344 -0.917843 0.0 -0.965648 -0.911923 1.019514 -1.019514 -0.702428 -1.573609 5.0 1.0
1 -1.237018 0.017991 -1.106579 -0.703618 -0.814769 -1.307636 0.592708 0.0 -0.705799 1.382184 -0.202922 0.196497 -1.197786 -0.094612 5.6 2.0
2 1.029744 -0.484345 -0.500216 -0.500815 -0.643438 -0.462057 -0.930939 0.0 -0.133523 -0.421532 -0.506159 0.506159 0.549132 -0.340250 5.0 2.0
3 -0.360340 -1.044006 -0.996178 -0.394536 -0.903668 -1.511897 -0.983446 0.0 -0.891406 -0.313674 -1.657878 1.657878 -0.248148 -0.435339 5.6 3.0
4 0.414658 -0.326933 -0.018705 0.219565 0.043512 -0.299451 -0.420808 0.0 -0.056908 -0.380576 0.132647 -0.132647 -1.293090 0.188674 6.1 3.0

2.5 SelectFwe

  • Select the p-values corresponding to Family-wise error rate
  • The FWER is the probability of making at least one type I error (false positive) in the family
In [59]:
from sklearn.feature_selection import SelectFwe, chi2
from sklearn.ensemble import RandomForestClassifier

alphas = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]
X_new = []
for i in alphas:
    X_new.append(SelectFwe(f_classif, alpha=i).fit_transform(train_X, train_Y))
In [60]:
# use the cross-validation to decide the best k value
for index, value in enumerate(alphas):
    mean, std = cross_validation(RandomForestClassifier(random_state = 42), X_new[index], train_Y, 10)
    print(value, mean, std)
0.01 0.6108950617283951 0.04981041637687091
0.02 0.6108950617283951 0.04981041637687091
0.03 0.6108950617283951 0.04981041637687091
0.04 0.6108950617283951 0.04981041637687091
0.05 0.6108950617283951 0.04981041637687091
0.06 0.6108950617283951 0.04981041637687091
0.07 0.6108950617283951 0.04981041637687091
0.08 0.6108950617283951 0.04981041637687091
0.09 0.6108950617283951 0.04981041637687091
0.1 0.6108950617283951 0.04981041637687091
In [61]:
# Use the inverse transform to check which features are selected, the removed feature have all 0s
sel = SelectFwe(f_classif, alpha=0.01)
X_new = sel.fit_transform(train_X, train_Y)
X_inverse = pd.DataFrame(sel.inverse_transform(X_new), columns = train_X.columns)
X_inverse.head()
Out[61]:
Tree_N Voxel_N Max_density Min_density Avg_density Std_density PCA_1 PCA_2 PCA_3 PCA_thick Tensor_helix Tensor_sheet Per_voxel Width Resolution N_Neighbors
0 1.166755 -1.005535 0.207948 0.926425 0.494709 -0.480344 -0.917843 0.0 -0.965648 -0.911923 1.019514 -1.019514 -0.702428 -1.573609 5.0 1.0
1 -1.237018 0.017991 -1.106579 -0.703618 -0.814769 -1.307636 0.592708 0.0 -0.705799 1.382184 -0.202922 0.196497 -1.197786 -0.094612 5.6 2.0
2 1.029744 -0.484345 -0.500216 -0.500815 -0.643438 -0.462057 -0.930939 0.0 -0.133523 -0.421532 -0.506159 0.506159 0.549132 -0.340250 5.0 2.0
3 -0.360340 -1.044006 -0.996178 -0.394536 -0.903668 -1.511897 -0.983446 0.0 -0.891406 -0.313674 -1.657878 1.657878 -0.248148 -0.435339 5.6 3.0
4 0.414658 -0.326933 -0.018705 0.219565 0.043512 -0.299451 -0.420808 0.0 -0.056908 -0.380576 0.132647 -0.132647 -1.293090 0.188674 6.1 3.0

2.6 GenericUnivariateSelect

  • Univariate feature selector with modes: ‘percentile’, ‘k_best’, ‘fpr’, ‘fdr’, ‘fwe’
In [62]:
# select best k features, k = 1, 2, ...
from sklearn.feature_selection import GenericUnivariateSelect, chi2
from sklearn.ensemble import RandomForestClassifier

X_new = []
for i in range(1, 17):
    X_new.append(GenericUnivariateSelect(f_classif, mode='k_best', param=i).fit_transform(train_X, train_Y))
In [63]:
# use the cross-validation to decide the best k value
for i in range(1, 17):
    mean, std = cross_validation(RandomForestClassifier(), X_new[i-1], train_Y, 10)
    print(i, mean, std)
1 0.3978395061728395 0.06922558103735099
2 0.40401234567901234 0.06159449211268092
3 0.45606481481481487 0.04367202426931814
4 0.4905864197530865 0.07240914399104736
5 0.5228086419753086 0.06750455115141904
6 0.5550308641975309 0.040187520081604984
7 0.5687037037037036 0.03910323303014139
8 0.5773148148148148 0.05367460542746953
9 0.5872993827160493 0.0497242745244736
10 0.5774074074074075 0.047114538724082305
11 0.5848611111111112 0.044532579899829214
12 0.5922222222222222 0.0594669152944048
13 0.57625 0.05266025694544926
14 0.5885956790123457 0.05625259100876067
15 0.5959722222222223 0.057707072072007884
16 0.5810185185185185 0.05987964545864488
In [65]:
# Use the inverse transform to check which features are selected, the removed feature have all 0s
sel = GenericUnivariateSelect(f_classif, mode='k_best', param=14)
X_new = sel.fit_transform(train_X, train_Y)
X_inverse = pd.DataFrame(sel.inverse_transform(X_new), columns = train_X.columns)
X_inverse.head()
Out[65]:
Tree_N Voxel_N Max_density Min_density Avg_density Std_density PCA_1 PCA_2 PCA_3 PCA_thick Tensor_helix Tensor_sheet Per_voxel Width Resolution N_Neighbors
0 1.166755 -1.005535 0.207948 0.926425 0.494709 -0.480344 -0.917843 0.0 -0.965648 -0.911923 1.019514 -1.019514 -0.702428 0.0 5.0 1.0
1 -1.237018 0.017991 -1.106579 -0.703618 -0.814769 -1.307636 0.592708 0.0 -0.705799 1.382184 -0.202922 0.196497 -1.197786 0.0 5.6 2.0
2 1.029744 -0.484345 -0.500216 -0.500815 -0.643438 -0.462057 -0.930939 0.0 -0.133523 -0.421532 -0.506159 0.506159 0.549132 0.0 5.0 2.0
3 -0.360340 -1.044006 -0.996178 -0.394536 -0.903668 -1.511897 -0.983446 0.0 -0.891406 -0.313674 -1.657878 1.657878 -0.248148 0.0 5.6 3.0
4 0.414658 -0.326933 -0.018705 0.219565 0.043512 -0.299451 -0.420808 0.0 -0.056908 -0.380576 0.132647 -0.132647 -1.293090 0.0 6.1 3.0

3. Recursive feature elimination

  • the estimator is trained on the initial set of features and the importance of each feature is obtained either through a coef_ attribute or through a featureimportances attribute
  • the least important features are pruned from current set of features
  • That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached

3.1 REF

In [68]:
# select best k features, k = 1, 2, ...
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

X_new = []
for i in range(1, 17):
    X_new.append(RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=i, step=1).fit_transform(train_X, train_Y))
In [69]:
# use the cross-validation to decide the feature number
for i in range(1, 17):
    mean, std = cross_validation(RandomForestClassifier(), X_new[i-1], train_Y, 10)
    print(i, mean, std)
1 0.4189814814814815 0.05721746920476154
2 0.4723611111111111 0.06615841532780373
3 0.5364506172839507 0.03918238747043045
4 0.565 0.04416173804147057
5 0.5798148148148148 0.050269847952512485
6 0.5885185185185186 0.046280460254536596
7 0.5923302469135802 0.05369213059988803
8 0.5774074074074075 0.043102390562996445
9 0.5762037037037037 0.040585762563331566
10 0.5872993827160494 0.04316951653454727
11 0.5860648148148149 0.03977735641027943
12 0.5860493827160493 0.04068600185367656
13 0.5909876543209877 0.04812470741522746
14 0.5872222222222223 0.0445588483846298
15 0.5873148148148147 0.051099908565357524
16 0.5934567901234568 0.05577076799470397
In [70]:
# Use the inverse transform to check which features are selected, the removed feature have all 0s
sel = RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=7, step=1)
X_new = sel.fit_transform(train_X, train_Y)
X_inverse = pd.DataFrame(sel.inverse_transform(X_new), columns = train_X.columns)
X_inverse.head()
Out[70]:
Tree_N Voxel_N Max_density Min_density Avg_density Std_density PCA_1 PCA_2 PCA_3 PCA_thick Tensor_helix Tensor_sheet Per_voxel Width Resolution N_Neighbors
0 1.166755 0.0 0.207948 0.926425 0.494709 0.0 0.0 0.0 -0.965648 0.0 1.019514 -1.019514 0.0 0.0 0.0 0.0
1 -1.237018 0.0 -1.106579 -0.703618 -0.814769 0.0 0.0 0.0 -0.705799 0.0 -0.202922 0.196497 0.0 0.0 0.0 0.0
2 1.029744 0.0 -0.500216 -0.500815 -0.643438 0.0 0.0 0.0 -0.133523 0.0 -0.506159 0.506159 0.0 0.0 0.0 0.0
3 -0.360340 0.0 -0.996178 -0.394536 -0.903668 0.0 0.0 0.0 -0.891406 0.0 -1.657878 1.657878 0.0 0.0 0.0 0.0
4 0.414658 0.0 -0.018705 0.219565 0.043512 0.0 0.0 0.0 -0.056908 0.0 0.132647 -0.132647 0.0 0.0 0.0 0.0

3.2 REF CV

In [73]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

rfecv = RFECV(estimator=RandomForestClassifier(random_state=42), step=1, cv=10, scoring='accuracy')
X_new = rfecv.fit_transform(train_X, train_Y)
In [76]:
# Use the inverse transform to check which features are selected, the removed feature have all 0s
X_inverse = pd.DataFrame(rfecv.inverse_transform(X_new), columns = train_X.columns)
print("Optimal number of features : %d" % rfecv.n_features_)
X_inverse.head()
Optimal number of features : 16
Out[76]:
Tree_N Voxel_N Max_density Min_density Avg_density Std_density PCA_1 PCA_2 PCA_3 PCA_thick Tensor_helix Tensor_sheet Per_voxel Width Resolution N_Neighbors
0 1.166755 -1.005535 0.207948 0.926425 0.494709 -0.480344 -0.917843 -1.361738 -0.965648 -0.911923 1.019514 -1.019514 -0.702428 -1.573609 5.0 1.0
1 -1.237018 0.017991 -1.106579 -0.703618 -0.814769 -1.307636 0.592708 0.238773 -0.705799 1.382184 -0.202922 0.196497 -1.197786 -0.094612 5.6 2.0
2 1.029744 -0.484345 -0.500216 -0.500815 -0.643438 -0.462057 -0.930939 -0.363756 -0.133523 -0.421532 -0.506159 0.506159 0.549132 -0.340250 5.0 2.0
3 -0.360340 -1.044006 -0.996178 -0.394536 -0.903668 -1.511897 -0.983446 -1.040810 -0.891406 -0.313674 -1.657878 1.657878 -0.248148 -0.435339 5.6 3.0
4 0.414658 -0.326933 -0.018705 0.219565 0.043512 -0.299451 -0.420808 -0.269689 -0.056908 -0.380576 0.132647 -0.132647 -1.293090 0.188674 6.1 3.0

4. Feature selection using SelectFromModel

  • can be used along with any estimator that has a coef_ or featureimportances attribute

4.1 L1-based feature selection

In [81]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(train_X, train_Y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(train_X)
In [82]:
# Use the inverse transform to check which features are selected, the removed feature have all 0s
X_inverse = pd.DataFrame(model.inverse_transform(X_new), columns = train_X.columns)
X_inverse.head()
Out[82]:
Tree_N Voxel_N Max_density Min_density Avg_density Std_density PCA_1 PCA_2 PCA_3 PCA_thick Tensor_helix Tensor_sheet Per_voxel Width Resolution N_Neighbors
0 1.166755 -1.005535 0.207948 0.926425 0.0 -0.480344 0.0 0.0 0.0 -0.911923 1.019514 -1.019514 -0.702428 -1.573609 5.0 1.0
1 -1.237018 0.017991 -1.106579 -0.703618 0.0 -1.307636 0.0 0.0 0.0 1.382184 -0.202922 0.196497 -1.197786 -0.094612 5.6 2.0
2 1.029744 -0.484345 -0.500216 -0.500815 0.0 -0.462057 0.0 0.0 0.0 -0.421532 -0.506159 0.506159 0.549132 -0.340250 5.0 2.0
3 -0.360340 -1.044006 -0.996178 -0.394536 0.0 -1.511897 0.0 0.0 0.0 -0.313674 -1.657878 1.657878 -0.248148 -0.435339 5.6 3.0
4 0.414658 -0.326933 -0.018705 0.219565 0.0 -0.299451 0.0 0.0 0.0 -0.380576 0.132647 -0.132647 -1.293090 0.188674 6.1 3.0

4.2 Tree-based feature selection

In [84]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

et_clf = ExtraTreesClassifier(n_estimators=50).fit(train_X, train_Y)
model = SelectFromModel(et_clf, prefit=True)
X_new = model.transform(train_X)
In [85]:
# Use the inverse transform to check which features are selected, the removed feature have all 0s
X_inverse = pd.DataFrame(model.inverse_transform(X_new), columns = train_X.columns)
X_inverse.head()
Out[85]:
Tree_N Voxel_N Max_density Min_density Avg_density Std_density PCA_1 PCA_2 PCA_3 PCA_thick Tensor_helix Tensor_sheet Per_voxel Width Resolution N_Neighbors
0 1.166755 0.0 0.207948 0.926425 0.494709 -0.480344 0.0 0.0 0.0 0.0 1.019514 -1.019514 0.0 0.0 0.0 0.0
1 -1.237018 0.0 -1.106579 -0.703618 -0.814769 -1.307636 0.0 0.0 0.0 0.0 -0.202922 0.196497 0.0 0.0 0.0 0.0
2 1.029744 0.0 -0.500216 -0.500815 -0.643438 -0.462057 0.0 0.0 0.0 0.0 -0.506159 0.506159 0.0 0.0 0.0 0.0
3 -0.360340 0.0 -0.996178 -0.394536 -0.903668 -1.511897 0.0 0.0 0.0 0.0 -1.657878 1.657878 0.0 0.0 0.0 0.0
4 0.414658 0.0 -0.018705 0.219565 0.043512 -0.299451 0.0 0.0 0.0 0.0 0.132647 -0.132647 0.0 0.0 0.0 0.0

Pipeline

In [86]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif

clf = Pipeline([
  ('feature_selection', SelectKBest(f_classif, k=14)),
  ('classification', RandomForestClassifier(random_state=42))
])
clf.fit(train_X, train_Y)
Out[86]:
Pipeline(memory=None,
         steps=[('feature_selection',
                 SelectKBest(k=14,
                             score_func=<function f_classif at 0x1a29109b00>)),
                ('classification',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=42,
                                        verbose=0, warm_start=False))],
         verbose=False)
In [88]:
y_test_pred = clf.predict(test_X)
In [89]:
from sklearn.metrics import classification_report
print(classification_report(test_Y, y_test_pred, target_names=['Helix', 'Sheet', 'Loop']))
              precision    recall  f1-score   support

       Helix       0.57      0.56      0.56        70
       Sheet       0.48      0.50      0.49        62
        Loop       0.51      0.50      0.51        70

    accuracy                           0.52       202
   macro avg       0.52      0.52      0.52       202
weighted avg       0.52      0.52      0.52       202