from sklearn.datasets import fetch_openml
# Fetch version 1 of the 'mnist_784' dataset from OpenML. With return_X_y=True
# the call returns a pair, unpacked here into the features and the labels.
data_X, data_Y = fetch_openml('mnist_784', version=1, return_X_y=True)


import matplotlib
import matplotlib.pyplot as plt
import numpy as np

# Display row 36000 of the dataset as a 28x28 image with the binary colormap.
# Indexing goes through np.asarray so that 36000 is always a row position:
# on a pandas DataFrame (what fetch_openml returns by default in recent
# scikit-learn releases) data_X[36000] would be a column lookup instead.
plt.imshow(np.asarray(data_X)[36000].reshape(28, 28), cmap=matplotlib.cm.binary, interpolation='nearest')

<matplotlib.image.AxesImage at 0x7fac9ed8d5d0>


from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as the test set. random_state=42 makes the split
# repeatable, and stratify=data_Y splits with respect to the class labels.
train_X, test_X, train_Y, test_Y = train_test_split(data_X, data_Y, test_size=0.2, random_state=42, stratify = data_Y)


# Notebook-style check: echo the shapes of the two feature sets.
train_X.shape, test_X.shape

((56000, 784), (14000, 784))


# `shift` is part of the public scipy.ndimage namespace; the old
# scipy.ndimage.interpolation module path is deprecated.
from scipy.ndimage import shift


def shift_image(image, dx, dy):
    """Return a flattened 28x28 image translated by (dx, dy) pixels.

    Args:
        image: Array holding 784 pixel values (anything ``reshape``-able to
            28x28).
        dx: Offset along the column axis; positive moves content right.
        dy: Offset along the row axis; positive moves content down.

    Returns:
        A 1-D array of 784 values. Pixels uncovered by the shift are filled
        with 0 (``mode="constant"``, ``cval=0``).
    """
    grid = image.reshape((28, 28))
    # scipy expects one offset per axis in axis order: (rows, columns) == (dy, dx).
    moved = shift(grid, [dy, dx], cval=0, mode="constant")
    return moved.reshape([-1])


# Data Augmentation
# Shift each MNIST image by one pixel in every direction (left, right, up and
# down), then add the shifted copies to the training set.
def augmentation(train_X, train_Y):
    """Extend a training set with one-pixel-shifted copies of every image.

    Args:
        train_X: 2-D collection of flattened images, one image per row.
            Anything ``np.asarray`` can convert is accepted; converting first
            guarantees that iteration walks rows (iterating a pandas DataFrame
            directly would yield its column labels instead).
        train_Y: Iterable of labels, one per row of ``train_X``.

    Returns:
        ``(X, y)`` as NumPy arrays, five times as long as the input: the
        original samples first, followed by the copies shifted by
        (dx, dy) = (1, 0), (-1, 0), (0, 1) and (0, -1), in that order.
    """
    images = np.asarray(train_X)
    labels = list(train_Y)
    shifts = ((1, 0), (-1, 0), (0, 1), (0, -1))

    augmented_images = list(images)
    augmented_images.extend(
        shift_image(image, dx, dy) for dx, dy in shifts for image in images
    )
    # Every shifted copy keeps the label of the image it was derived from, so
    # the label sequence simply repeats once per shift after the originals.
    augmented_labels = labels * (len(shifts) + 1)

    return np.array(augmented_images), np.array(augmented_labels)


import numpy as np
# Replace the training split with its augmented version (the originals plus
# the shifted copies produced by augmentation). The test split is not touched.
train_X, train_Y = augmentation(train_X, train_Y)


# Notebook-style check: echo the shapes again after augmentation.
train_X.shape, test_X.shape

((280000, 784), (14000, 784))


# feature scaling
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()


# Fit the scaler on the training data and scale that data in the same step.
train_X = scaler.fit_transform(train_X)


# Reuse the already-fitted scaler on the test set (transform only, no refit),
# so the test data does not influence the scaling parameters.
test_X = scaler.transform(test_X)


from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# Candidate classifiers to compare, each created with its default settings.
models = [
    RandomForestClassifier(),  # random forest
    MultinomialNB(),  # multinomial naive Bayes
]


import numpy as np
from sklearn.model_selection import cross_val_score

def cross_validation(model, X, Y, k = 10):
    """Cross-validate ``model`` and summarise its macro-averaged F1 scores.

    Args:
        model: Estimator handed to ``cross_val_score``.
        X: Feature matrix.
        Y: Target labels.
        k: Value passed as ``cv`` to ``cross_val_score`` (10 by default).

    Returns:
        ``(mean, std)`` of the per-fold scores: the mean describes the
        performance, the standard deviation how much it varies across folds.
    """
    fold_scores = cross_val_score(model, X, Y, cv=k, scoring='f1_macro')
    average = fold_scores.mean()
    spread = fold_scores.std()
    return average, spread


# Compare the candidate models with a quick 2-fold cross-validation and print
# "<name> <mean score> <score std>" for each one.
# NOTE(review): train_X is the augmented set at this point, so shifted copies
# of one digit can end up in different folds -- scores may be optimistic;
# confirm whether augmentation should happen inside each fold instead.
for name, model in zip(('rf', 'nb'), models):
    mean, std = cross_validation(model, train_X, train_Y, k=2)
    print(name, mean, std)

/Users/lchen/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
/Users/lchen/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)

rf 0.9252036273676647 0.016353611304756432
nb 0.7661275110772863 0.028429503729057393


from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the random forest: two values for n_estimators times
# two values for max_features, i.e. four candidate combinations.
param_grid = {'n_estimators': [100, 200], 'max_features': ['sqrt', 'log2']}


# Evaluate every combination with 5-fold cross-validation, scored by
# macro-averaged F1 (the same scoring string used in cross_validation).
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='f1_macro')
grid_search.fit(train_X, train_Y)


# Notebook-style display of the estimator selected by the grid search.
grid_search.best_estimator_


# Notebook-style display of the mean cross-validated score of each candidate.
grid_search.cv_results_['mean_test_score']


# The grid search above was created without overriding `refit`, and
# GridSearchCV's default (refit=True) retrains the best parameter combination
# on the full data passed to fit(). best_estimator_ is therefore already
# trained on train_X / train_Y; fitting it a second time would only repeat
# that training run.
final_model = grid_search.best_estimator_


# Predict labels for the held-out test set (test_X was scaled earlier with the
# scaler fitted on the training data).
pred = final_model.predict(test_X)


from sklearn.metrics import accuracy_score
# Notebook-style display of the test-set accuracy.
# NOTE(review): model selection above used 'f1_macro' while this reports
# accuracy -- confirm the metric mismatch is intended.
accuracy_score(test_Y, pred)

Multiclass Classification

1. Get the Data

2. Discover and visualize the data to gain insights

3. Prepare Training Set and Test Set

4. Prepare the Data for Machine Learning Algorithms

5. Select and Train a Model

6. Fine-Tune Your Model

7. Train a Final Model

8. Evaluate System on the Test Set