Multilabel and Multioutput Algorithms¶

1. Multilabel Classification¶

Each sample is labeled with m labels drawn from n_classes possible classes.
m can range from 0 to n_classes, inclusive.
This is comparable to running n_classes binary classification tasks: a binary output is assigned to each class, for every sample.

1.1 Multioutput Classifier¶

Some classifiers in sklearn, such as KNeighborsClassifier and RandomForestClassifier, support multilabel classification natively and do not need to be wrapped in MultiOutputClassifier.
If a classifier does not support multilabel classification natively, such as LogisticRegression, it needs to be wrapped in MultiOutputClassifier.

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# Load the digits dataset: X holds the features, y the digit label of each sample.
digits = load_digits()
X, y = digits.data, digits.target

# Hold out 20% of the samples for testing; stratifying on y keeps the
# class proportions the same in both splits.
train_X, test_X, train_Y, test_Y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

import numpy as np

# Derive two binary labels per training sample from its digit value:
#   column 0 -> the digit is "large" (>= 7)
#   column 1 -> the digit is odd
y_train_large = train_Y >= 7
y_train_odd = (train_Y % 2) == 1
y_multilabel = np.column_stack((y_train_large, y_train_odd))

# RandomForestClassifier (like KNeighborsClassifier) supports multilabel targets natively,
# so it is fitted directly on the two-column label matrix without any wrapper
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=1)
model.fit(train_X, y_multilabel)

y_pred = model.predict(test_X) # 360*2: one row per test sample, one column per label

from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

# Wrap a random forest in MultiOutputClassifier and train it on both labels.
forest = RandomForestClassifier(random_state=1)
model = MultiOutputClassifier(forest, n_jobs=-1).fit(train_X, y_multilabel)
y_pred = model.predict(test_X)  # 360*2

# Same wrapper around LogisticRegression, which needs it for multilabel targets.
lr = LogisticRegression(random_state=0)
model = MultiOutputClassifier(lr, n_jobs=-1).fit(train_X, y_multilabel)
y_pred = model.predict(test_X)  # 360*2

1.2 Classifier Chain¶

Combines a number of binary classifiers into a single multi-label model.
Each model makes a prediction in the order specified by the chain, using all of the available features provided to the model plus the predictions of the models that are earlier in the chain.

from sklearn.multioutput import ClassifierChain

# Base estimator used for the links of the chain.
model = RandomForestClassifier(random_state=1)

# order='random' picks a random label order; random_state=0 makes that choice repeatable.
chain = ClassifierChain(model, order='random', random_state=0).fit(train_X, y_multilabel)
y_pred = chain.predict(test_X)  # 360*2

2. Multioutput Regression¶

Predicts multiple numerical properties (targets) for each sample.

2.1 Multioutput Regressor¶

from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor

# Small synthetic regression problem: 10 samples, 3 targets per sample.
X, y = make_regression(n_samples=10, n_targets=3, random_state=1)

# MultiOutputRegressor fits one regressor per target, so it cannot take
# advantage of correlations between targets.
model = MultiOutputRegressor(GradientBoostingRegressor(random_state=0)).fit(X, y)

model.predict(X)

array([[-154.75474165, -147.03498585,  -50.03812219],
       [   7.12165031,    5.12914884,  -81.46081961],
       [-187.8948621 , -100.44373091,   13.88978285],
       [-141.62745778,   95.02891072, -191.48204257],
       [  97.03260883,  165.34867495,  139.52003279],
       [ 123.92529176,   21.25719016,   -7.84253   ],
       [-122.25193977,  -85.16443186, -107.12274212],
       [ -30.170388  ,  -94.80956739,   12.16979946],
       [ 140.72667194,  176.50941682,  -17.50447799],
       [ 149.37967282,  -81.15699552,   -5.72850319]])

2.2 Regressor Chain¶

Combines a number of regressors into a single multi-target model.
Each model makes a prediction in the order specified by the chain, using all of the available features provided to the model plus the predictions of the models that are earlier in the chain.

from sklearn.multioutput import RegressorChain
from sklearn.linear_model import LogisticRegression
# Seed the base regressor, as in the MultiOutputRegressor example, so that
# repeated runs produce the same predictions. The estimator is passed
# positionally, like in the ClassifierChain call, so the code does not depend
# on the name of the estimator keyword argument.
model = GradientBoostingRegressor(random_state=0)
chain = RegressorChain(model)
chain.fit(X, y)

# Predict on the training samples: one row per sample, one column per target.
chain.predict(X)

array([[-154.75474165, -147.03498585,  -50.03812219],
       [   7.12165031,    5.12914884,  -81.46081961],
       [-187.8948621 , -100.44373091,   13.88978285],
       [-141.62745778,   95.02891072, -191.48204257],
       [  97.03260883,  165.34867495,  139.52003279],
       [ 123.92529176,   21.25719016,   -7.84253   ],
       [-122.25193977,  -85.16443186, -107.12274212],
       [ -30.170388  ,  -94.80956739,   12.16979946],
       [ 140.72667194,  176.50941682,  -17.50447799],
       [ 149.37967282,  -81.15699552,   -5.72850319]])

3. Multiclass-multioutput Classification¶

Also known as multitask classification.
Each sample is labeled with a set of non-binary properties.
All classifiers that handle multiclass-multioutput also support the multilabel classification task as a special case.
sklearn currently defines no dedicated multiclass-multioutput classification class comparable to MultiOutputClassifier and MultiOutputRegressor.

# Start again from the original digits data (X and y were overwritten by the
# regression example) and redo the same stratified 80/20 split.
digits = load_digits()
X, y = digits.data, digits.target

train_X, test_X, train_Y, test_Y = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Build a denoising task: the inputs are the images with integer noise in
# {0, 1, 2} added to every feature, the targets are the clean images.
# A seeded generator keeps the noise (and therefore the results) reproducible,
# and the noise shape is taken from the data instead of hard-coding 64 features.
rng = np.random.default_rng(42)
noise = rng.integers(0, 3, size=train_X.shape)
X_train_mod = train_X + noise
noise = rng.integers(0, 3, size=test_X.shape)
X_test_mod = test_X + noise

# Targets: the original, noise-free feature values.
y_train_mod = train_X
y_test_mod = test_X

import matplotlib
import matplotlib.pyplot as plt

# Show noisy test sample 100 as an 8x8 image.
plt.imshow(
    X_test_mod[100].reshape(8, 8),
    cmap=matplotlib.cm.binary,
    interpolation='nearest',
)

<matplotlib.image.AxesImage at 0x7ffbbc7f4690>

# KNeighborsClassifier supports multiclass-multioutput classification by
# default, so it is trained directly to map noisy images to clean ones.
knn_clf = KNeighborsClassifier().fit(X_train_mod, y_train_mod)

# predict is given a one-element list so the single sample is a 2-D input.
clean_digit = knn_clf.predict([X_test_mod[100]])

# Show the predicted (denoised) version of test sample 100 as an 8x8 image.
plt.imshow(
    clean_digit.reshape(8, 8),
    cmap=matplotlib.cm.binary,
    interpolation='nearest',
)

<matplotlib.image.AxesImage at 0x7ffbbcb81210>