from sklearn.datasets import fetch_openml
import numpy as np

# Download the 70,000-sample MNIST digits dataset from OpenML.
data_X, data_Y = fetch_openml('mnist_784', version=1, return_X_y=True)
# Depending on the installed scikit-learn release, fetch_openml may return
# pandas objects instead of ndarrays. The code below indexes rows
# positionally (data_X[36000]) and iterates over rows, which only works on
# ndarrays, so normalise here. This is a no-op when ndarrays are returned.
data_X, data_Y = np.asarray(data_X), np.asarray(data_Y)
import matplotlib
import matplotlib.pyplot as plt
# Preview a single sample (row 36000) as a 28x28 image using the binary
# colormap and no pixel smoothing.
plt.imshow(
    data_X[36000].reshape(28, 28),
    cmap=matplotlib.cm.binary,
    interpolation='nearest',
)
<matplotlib.image.AxesImage at 0x7fac9ed8d5d0>
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; stratify on the labels so both
# splits keep the same class proportions, and fix the seed for repeatability.
train_X, test_X, train_Y, test_Y = train_test_split(
    data_X,
    data_Y,
    test_size=0.2,
    random_state=42,
    stratify=data_Y,
)
# Notebook-style echo of the resulting split sizes.
train_X.shape, test_X.shape
((56000, 784), (14000, 784))
from scipy.ndimage.interpolation import shift
def shift_image(image, dx, dy, shape=(28, 28)):
    """Translate a flattened image by ``(dx, dy)`` pixels.

    Args:
        image: Array-like holding one image; it must contain exactly
            ``shape[0] * shape[1]`` values.
        dx: Shift along the column axis (positive moves content toward
            higher column indices).
        dy: Shift along the row axis (positive moves content toward
            higher row indices).
        shape: ``(rows, cols)`` used to view ``image`` in 2-D. Defaults to
            ``(28, 28)``, the size used throughout this file.

    Returns:
        A 1-D array of the shifted pixels. Positions uncovered by the
        shift are filled with 0.
    """
    # np.asarray lets callers pass lists or other array-likes, not only
    # ndarrays that already expose .reshape.
    grid = np.asarray(image).reshape(shape)
    # scipy takes one shift per axis: axis 0 is rows (dy), axis 1 is
    # columns (dx).
    moved = shift(grid, [dy, dx], cval=0, mode="constant")
    return moved.reshape(-1)
# Data Augmentation
# Shift each MNIST image by one pixel in each direction (left, right, up, and down), then add the shifted copies to the training set
def augmentation(train_X, train_Y, shifts=((1, 0), (-1, 0), (0, 1), (0, -1))):
    """Extend a training set with shifted copies of every image.

    Args:
        train_X: Array-like of flattened images, one image per row.
        train_Y: Array-like of labels, one per row of ``train_X``.
        shifts: Iterable of ``(dx, dy)`` offsets; one shifted copy of the
            whole set is produced per offset. Defaults to one pixel in each
            of the four axis directions.

    Returns:
        ``(X_augmented, y_augmented)`` as arrays: the original rows first,
        followed by the shifted copies in the order given by ``shifts``.
    """
    # Coerce to ndarrays so that iteration below walks over rows.
    train_X = np.asarray(train_X)
    train_Y = np.asarray(train_Y)
    shifts = tuple(shifts)

    n_samples = len(train_X)
    n_copies = len(shifts) + 1

    # Allocate the output once and fill it in place, rather than collecting
    # one small array per image in a Python list and copying at the end.
    X_train_augmented = np.empty(
        (n_copies * n_samples,) + train_X.shape[1:], dtype=train_X.dtype
    )
    X_train_augmented[:n_samples] = train_X
    for position, (dx, dy) in enumerate(shifts, start=1):
        chunk = X_train_augmented[position * n_samples:(position + 1) * n_samples]
        for row, image in zip(chunk, train_X):
            row[...] = shift_image(image, dx, dy)

    # Labels are unchanged by shifting, so repeat them once per copy.
    y_train_augmented = np.concatenate([train_Y] * n_copies)
    return X_train_augmented, y_train_augmented
import numpy as np
# Replace the training set with its augmented version: the original rows plus
# one shifted copy of every image per direction. The test set is not augmented.
train_X, train_Y = augmentation(train_X, train_Y)
# Notebook-style echo of the enlarged training set and the unchanged test set.
train_X.shape, test_X.shape
((280000, 784), (14000, 784))
# feature scaling
from sklearn.preprocessing import MinMaxScaler
# Learn the per-feature min/max from the training rows only, then apply the
# same scaling to both the training and the test rows.
scaler = MinMaxScaler()
scaler.fit(train_X)
train_X = scaler.transform(train_X)
test_X = scaler.transform(test_X)
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# Candidate classifiers, in the order they are evaluated later.
models = [
    RandomForestClassifier(),  # random forest
    MultinomialNB(),  # multinomial naive Bayes
]
import numpy as np
from sklearn.model_selection import cross_val_score
def cross_validation(model, X, Y, k=10):
    """Cross-validate ``model`` on ``(X, Y)`` and summarise its macro-F1.

    ``k`` is passed to ``cross_val_score`` as its ``cv`` argument.

    Returns:
        ``(mean, std)`` of the per-fold scores: the average performance and
        how much it varies between folds.
    """
    fold_scores = cross_val_score(model, X, Y, cv=k, scoring='f1_macro')
    average = fold_scores.mean()
    spread = fold_scores.std()
    return average, spread
# Score every candidate with 2-fold cross-validation and print its short
# name followed by the mean and standard deviation of the fold scores.
for name, model in zip(['rf', 'nb'], models):
    mean, std = cross_validation(model, train_X, train_Y, k=2)
    print(name, mean, std)
/Users/lchen/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. "10 in version 0.20 to 100 in 0.22.", FutureWarning) /Users/lchen/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. "10 in version 0.20 to 100 in 0.22.", FutureWarning)
rf 0.9252036273676647 0.016353611304756432 nb 0.7661275110772863 0.028429503729057393
from sklearn.model_selection import GridSearchCV
# Tune the random forest over forest size and per-split feature count,
# selecting by macro-F1 with 5-fold cross-validation.
# NOTE(review): train_X already contains the shifted copies at this point, so
# copies of the same digit can fall into both the fitting and validation
# folds; the cross-validated scores are probably optimistic.
param_grid = {'n_estimators': [100, 200], 'max_features': ['sqrt', 'log2']}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='f1_macro')
grid_search.fit(train_X, train_Y)
grid_search.best_estimator_
grid_search.cv_results_['mean_test_score']
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# retrained on the whole of train_X with the winning parameters. Fitting it
# again would only repeat that work and, with no fixed random_state, would
# produce a different forest from the one the search reports.
final_model = grid_search.best_estimator_
pred = final_model.predict(test_X)
from sklearn.metrics import accuracy_score
# Accuracy on the held-out test set.
accuracy_score(test_Y, pred)