import numpy as np
import pandas as pd
# Load the data set and drop columns that are not used as model features.
data = pd.read_csv('data_3.csv')
data = data.drop(['Cryo', 'PDB', 'Cryo Type', 'neighbor trees'], axis=1)
columns = {
    'Map Resolution': 'Resolution',
    'vTree#': 'Tree_N',
    'numOfVoxels': 'Voxel_N',
    'numberOfNeighbors': 'N_Neighbors',
    'Max Density': 'Max_density',
    'MinDensity': 'Min_density',
    'AvgDensity': 'Avg_density',
    'stdDensity': 'Std_density',
    'PCA_EigenValue1': 'PCA_1',
    'PCA_EigenValue2': 'PCA_2',
    'PCA_EigenValue3': 'PCA_3',
    'PCA Thickness Ratio ev2/ev3': 'PCA_thick',
    'Structure Tensor Helix (Percentage)': 'Tensor_helix',
    'Structure Tensor Sheet (Percentage)': 'Tensor_sheet',
    'Percentage of voxels with density less than average': 'Per_voxel',
    'Width': 'Width',
    'Hlx overlap (percentage)': 'Hlx_Per',
    'Strand overlap (percentage)': 'Sheet_Per',
    'Loop overlap (percentage)': 'Loop_Per',
}
data = data.rename(columns=columns)
# For each fragment, record the largest of the three overlap percentages.
temp = data[['Hlx_Per', 'Sheet_Per', 'Loop_Per']]
max_Per = np.max(temp, axis=1)
data['Max_Per'] = max_Per
data.head()
# Keep only fragments whose maximum overlap percentage exceeds the threshold,
# i.e. fragments dominated by a single secondary-structure class.
threshold = 0.9
data = data[data['Max_Per'] > threshold]
data = data.drop(['Hlx_Per', 'Sheet_Per', 'Loop_Per', 'Max_Per'], axis=1)
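# A quick sanity check (optional): inspect the class balance before sampling.
# The fixed sample size of 350 below presumably matches the size of the
# smallest class, Sheet, which is kept in full.
print(data['Label'].value_counts())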
data_Helix = data[data['Label'] == 'Helix']
data_Sheet = data[data['Label'] == 'Sheet']
data_Loop = data[data['Label'] == 'Loop']
# Downsample Helix and Loop to 350 fragments each to balance the classes;
# Sheet is kept in full.
data_Helix = data_Helix.sample(n=350, random_state=42)
data_Loop = data_Loop.sample(n=350, random_state=42)
print(data_Helix.shape, data_Sheet.shape, data_Loop.shape)
data = pd.concat([data_Helix, data_Sheet, data_Loop])
data.head()
data.hist(bins = 50, figsize = (20, 15))
# Split the features (all columns but the last) from the Label target.
data_X = data.iloc[:, 0:-1]
data_Y = data.iloc[:, -1]
# Encode the class labels as integers. map() avoids the chained-assignment
# SettingWithCopyWarning that element-wise assignment on a slice can raise,
# so no global warning filter is needed.
data_Y = data_Y.map({'Helix': 0, 'Sheet': 1, 'Loop': 2})
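# For reference, scikit-learn's LabelEncoder could do this encoding as well,
# but it assigns codes alphabetically (Helix=0, Loop=1, Sheet=2), which would
# not match the explicit mapping above:
from sklearn.preprocessing import LabelEncoder
print(LabelEncoder().fit_transform(data['Label'])[:5])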
from sklearn.model_selection import train_test_split
train_X, test_X, train_Y, test_Y = train_test_split(data_X, data_Y, test_size=0.2, random_state=42, stratify = data_Y)
train_Y = train_Y.astype(np.int64)
test_Y = test_Y.astype(np.int64)
# Select specific columns: a small transformer so the pipelines below can pick
# out DataFrame column subsets by name.
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.attribute_names]
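# A minimal usage sketch of the selector on a toy frame (newer scikit-learn
# versions also offer sklearn.compose.ColumnTransformer for this use case):
toy = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
print(DataFrameSelector(['a']).fit_transform(toy))  # keeps only column 'a'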
# Create Pipeline for Numeric Columns
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
left_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(['Tree_N', 'Voxel_N', 'Max_density',
                                          'Min_density', 'Avg_density', 'Std_density',
                                          'PCA_1', 'PCA_2', 'PCA_3', 'PCA_thick',
                                          'Tensor_helix', 'Tensor_sheet', 'Per_voxel', 'Width'])),
    ("transform", QuantileTransformer(n_quantiles=200, output_distribution='normal', random_state=42)),
])
right_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(['Resolution', 'N_Neighbors'])),
])
from sklearn.pipeline import FeatureUnion
preprocess_pipeline = FeatureUnion(transformer_list=[
    ("left_pipeline", left_pipeline),
    ("right_pipeline", right_pipeline),
])
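# For reference, a rough equivalent of the two pipelines above using a single
# ColumnTransformer (a sketch; the output column order still follows the
# transformer list, quantile-transformed columns first):
from sklearn.compose import ColumnTransformer
alt_pipeline = ColumnTransformer([
    ("quantile", QuantileTransformer(n_quantiles=200, output_distribution='normal', random_state=42),
     ['Tree_N', 'Voxel_N', 'Max_density', 'Min_density', 'Avg_density',
      'Std_density', 'PCA_1', 'PCA_2', 'PCA_3', 'PCA_thick', 'Tensor_helix',
      'Tensor_sheet', 'Per_voxel', 'Width']),
    ("raw", 'passthrough', ['Resolution', 'N_Neighbors']),
])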
feature_cols = ['Tree_N', 'Voxel_N', 'Max_density', 'Min_density', 'Avg_density',
                'Std_density', 'PCA_1', 'PCA_2', 'PCA_3', 'PCA_thick', 'Tensor_helix',
                'Tensor_sheet', 'Per_voxel', 'Width', 'Resolution', 'N_Neighbors']
train_X = pd.DataFrame(preprocess_pipeline.fit_transform(train_X), columns=feature_cols)
# Only transform (never fit) the test set, so the quantile mapping learned on
# the training data is reused and no information leaks from the test set.
test_X = pd.DataFrame(preprocess_pipeline.transform(test_X), columns=feature_cols)
train_X.head()
train_X.hist(bins = 50, figsize = (20, 15))
# Toy example: VarianceThreshold drops low-variance (near-constant) features.
from sklearn.feature_selection import VarianceThreshold
X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
sel = VarianceThreshold(threshold=0.16)
X_new = sel.fit_transform(X)
sel.inverse_transform(X_new)  # invert the selection; removed columns come back as all 0s
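# Why 0.16: for a Bernoulli (0/1) feature the variance is p * (1 - p), so a
# threshold of 0.8 * 0.2 = 0.16 drops boolean columns that take the same value
# in more than roughly 80% of samples. Checking the toy data confirms this:
print(np.var(X, axis=0))  # the first column (var ~0.14) falls below 0.16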
from sklearn.model_selection import cross_val_score

def cross_validation(model, X, Y, k=10, metric='accuracy'):
    """Return the mean and standard deviation of k-fold cross-validation scores."""
    scores = cross_val_score(model, X, Y, scoring=metric, cv=k)
    return scores.mean(), scores.std()
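# Example call (a sketch): 10-fold accuracy of a seeded random forest on the
# full preprocessed training set, before any feature selection.
from sklearn.ensemble import RandomForestClassifier
mean_acc, std_acc = cross_validation(RandomForestClassifier(random_state=42), train_X, train_Y)
print(mean_acc, std_acc)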
# Select the best k features for k = 1, 2, ..., 16.
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
X_new = []
for i in range(1, 17):
    X_new.append(SelectKBest(f_classif, k=i).fit_transform(train_X, train_Y))
# Use cross-validation to decide the best value of k (the forest is seeded for
# reproducibility).
for i in range(1, 17):
    mean, std = cross_validation(RandomForestClassifier(random_state=42), X_new[i - 1], train_Y, 10)
    print(i, mean, std)
# Use inverse_transform to check which features were selected; the removed
# features come back as all-zero columns.
sel = SelectKBest(f_classif, k=14)
X_new = sel.fit_transform(train_X, train_Y)
X_inverse = pd.DataFrame(sel.inverse_transform(X_new), columns = train_X.columns)
X_inverse.head()
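# get_support() offers a more direct view than inverse_transform: it returns a
# boolean mask over the input columns, so the selected names can be read off.
print(train_X.columns[sel.get_support()])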
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.ensemble import RandomForestClassifier
percentages = range(5, 105, 5)
X_new = []
for i in percentages:
    X_new.append(SelectPercentile(f_classif, percentile=i).fit_transform(train_X, train_Y))
# Use cross-validation to decide the best percentile.
for index, value in enumerate(percentages):
    mean, std = cross_validation(RandomForestClassifier(random_state=42), X_new[index], train_Y, 10)
    print(value, mean, std)
# Use inverse_transform to check which features were selected; the removed
# features come back as all-zero columns.
sel = SelectPercentile(f_classif, percentile=95)
X_new = sel.fit_transform(train_X, train_Y)
X_inverse = pd.DataFrame(sel.inverse_transform(X_new), columns = train_X.columns)
X_inverse.head()
from sklearn.feature_selection import SelectFpr, f_classif
from sklearn.ensemble import RandomForestClassifier
alphas = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]
X_new = []
for i in alphas:
    X_new.append(SelectFpr(f_classif, alpha=i).fit_transform(train_X, train_Y))
# Use cross-validation to decide the best alpha.
for index, value in enumerate(alphas):
    mean, std = cross_validation(RandomForestClassifier(random_state=42), X_new[index], train_Y, 10)
    print(value, mean, std)
# Use inverse_transform to check which features were selected; the removed
# features come back as all-zero columns.
sel = SelectFpr(f_classif, alpha=0.01)
X_new = sel.fit_transform(train_X, train_Y)
X_inverse = pd.DataFrame(sel.inverse_transform(X_new), columns = train_X.columns)
X_inverse.head()
from sklearn.feature_selection import SelectFdr, f_classif
from sklearn.ensemble import RandomForestClassifier
alphas = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]
X_new = []
for i in alphas:
    X_new.append(SelectFdr(f_classif, alpha=i).fit_transform(train_X, train_Y))
# Use cross-validation to decide the best alpha.
for index, value in enumerate(alphas):
    mean, std = cross_validation(RandomForestClassifier(random_state=42), X_new[index], train_Y, 10)
    print(value, mean, std)
# Use inverse_transform to check which features were selected; the removed
# features come back as all-zero columns.
sel = SelectFdr(f_classif, alpha=0.01)
X_new = sel.fit_transform(train_X, train_Y)
X_inverse = pd.DataFrame(sel.inverse_transform(X_new), columns = train_X.columns)
X_inverse.head()
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.ensemble import RandomForestClassifier
alphas = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]
X_new = []
for i in alphas:
    X_new.append(SelectFwe(f_classif, alpha=i).fit_transform(train_X, train_Y))
# Use cross-validation to decide the best alpha.
for index, value in enumerate(alphas):
    mean, std = cross_validation(RandomForestClassifier(random_state=42), X_new[index], train_Y, 10)
    print(value, mean, std)
# Use inverse_transform to check which features were selected; the removed
# features come back as all-zero columns.
sel = SelectFwe(f_classif, alpha=0.01)
X_new = sel.fit_transform(train_X, train_Y)
X_inverse = pd.DataFrame(sel.inverse_transform(X_new), columns = train_X.columns)
X_inverse.head()
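# Note on the three alpha-based selectors above: SelectFpr thresholds each
# f_classif p-value at alpha (per-test false positive rate), SelectFdr applies
# the Benjamini-Hochberg procedure (false discovery rate), and SelectFwe uses
# a Bonferroni-style correction (family-wise error rate), making it the most
# conservative of the three at a given alpha. The FWE-selected columns:
print(train_X.columns[sel.get_support()])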
# Select the best k features for k = 1, 2, ... via the generic interface.
from sklearn.feature_selection import GenericUnivariateSelect, f_classif
from sklearn.ensemble import RandomForestClassifier
X_new = []
for i in range(1, 17):
    X_new.append(GenericUnivariateSelect(f_classif, mode='k_best', param=i).fit_transform(train_X, train_Y))
# Use cross-validation to decide the best value of k.
for i in range(1, 17):
    mean, std = cross_validation(RandomForestClassifier(random_state=42), X_new[i - 1], train_Y, 10)
    print(i, mean, std)
# Use inverse_transform to check which features were selected; the removed
# features come back as all-zero columns.
sel = GenericUnivariateSelect(f_classif, mode='k_best', param=14)
X_new = sel.fit_transform(train_X, train_Y)
X_inverse = pd.DataFrame(sel.inverse_transform(X_new), columns = train_X.columns)
X_inverse.head()
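# GenericUnivariateSelect with mode='k_best' is equivalent to SelectKBest, so
# the selected columns should match the k=14 SelectKBest result above:
print(train_X.columns[sel.get_support()])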
# select best k features, k = 1, 2, ...
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
X_new = []
for i in range(1, 17):
    X_new.append(RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=i, step=1).fit_transform(train_X, train_Y))
# Use cross-validation to decide the number of features to keep.
for i in range(1, 17):
    mean, std = cross_validation(RandomForestClassifier(random_state=42), X_new[i - 1], train_Y, 10)
    print(i, mean, std)
# Use inverse_transform to check which features were selected; the removed
# features come back as all-zero columns.
sel = RFE(estimator=RandomForestClassifier(random_state=42), n_features_to_select=7, step=1)
X_new = sel.fit_transform(train_X, train_Y)
X_inverse = pd.DataFrame(sel.inverse_transform(X_new), columns = train_X.columns)
X_inverse.head()
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
rfecv = RFECV(estimator=RandomForestClassifier(random_state=42), step=1, cv=10, scoring='accuracy')
X_new = rfecv.fit_transform(train_X, train_Y)
# Use inverse_transform to check which features were selected; the removed
# features come back as all-zero columns.
X_inverse = pd.DataFrame(rfecv.inverse_transform(X_new), columns = train_X.columns)
print("Optimal number of features : %d" % rfecv.n_features_)
X_inverse.head()
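# Optionally inspect the mean cross-validated score at each feature count.
# cv_results_ is available in scikit-learn >= 1.0; older releases expose
# grid_scores_ instead, hence the guarded access.
cv_scores = getattr(rfecv, 'cv_results_', None)
if cv_scores is not None:
    print(cv_scores['mean_test_score'])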
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
# L1-penalised linear SVM (dual=False is required when penalty='l1');
# the small C drives more coefficients to exactly zero.
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(train_X, train_Y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(train_X)
# Use inverse_transform to check which features were selected; the removed
# features come back as all-zero columns.
X_inverse = pd.DataFrame(model.inverse_transform(X_new), columns = train_X.columns)
X_inverse.head()
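# With the L1 penalty, features whose coefficients are (near) zero across all
# three one-vs-rest classifiers are dropped; the surviving columns:
print(train_X.columns[model.get_support()])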
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
# Tree ensembles expose feature_importances_, which SelectFromModel thresholds
# (seeded for reproducibility).
et_clf = ExtraTreesClassifier(n_estimators=50, random_state=42).fit(train_X, train_Y)
model = SelectFromModel(et_clf, prefit=True)
X_new = model.transform(train_X)
# Use inverse_transform to check which features were selected; the removed
# features come back as all-zero columns.
X_inverse = pd.DataFrame(model.inverse_transform(X_new), columns = train_X.columns)
X_inverse.head()
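# By default SelectFromModel keeps features whose importance exceeds the mean
# importance; ranking the raw importances makes that cut-off visible.
importances = pd.Series(et_clf.feature_importances_, index=train_X.columns)
print(importances.sort_values(ascending=False))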
# Final model: keep the 14 best univariate features and feed them to a random forest.
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
clf = Pipeline([
    ('feature_selection', SelectKBest(f_classif, k=14)),
    ('classification', RandomForestClassifier(random_state=42)),
])
clf.fit(train_X, train_Y)
y_test_pred = clf.predict(test_X)
from sklearn.metrics import classification_report
print(classification_report(test_Y, y_test_pred, target_names=['Helix', 'Sheet', 'Loop']))
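# A confusion matrix complements the per-class report (rows are true classes
# in the order Helix, Sheet, Loop; columns are predicted classes):
from sklearn.metrics import confusion_matrix
print(confusion_matrix(test_Y, y_test_pred))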