from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
iris = load_iris()
X = iris.data[:, 2:] # petal length and width
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X_train, y_train)
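# Quick sanity check (not in the original walkthrough): mean accuracy on the
# held-out split; the exact value depends on the random train/test split
tree_clf.score(X_test, y_test)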
from sklearn.tree import export_graphviz
export_graphviz(
    tree_clf,
    out_file="iris_tree.dot",
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True,
)
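# The .dot file can be rendered to an image with the Graphviz CLI, e.g.:
#   dot -Tpng iris_tree.dot -o iris_tree.png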
from sklearn import tree
tree.plot_tree(tree_clf, feature_names=iris.feature_names[2:],
               class_names=iris.target_names, rounded=True, filled=True)
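# plot_tree draws the same diagram with matplotlib alone, so no Graphviz install is needed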
from sklearn.tree import export_text  # sklearn.tree.export was removed in newer scikit-learn
r = export_text(tree_clf, feature_names=iris.feature_names[2:])
print(r)  # the fitted rules as nested if/else threshold tests, one line per node
print(iris.target_names)
tree_clf.predict([[5, 1.5]])[0]
tree_clf.predict_proba([[5, 1.5]])  # class fractions in the reached leaf: 0 for setosa, 0.026 for versicolor, 0.974 for virginica
tree_clf.feature_importances_  # importances sum to 1; petal length matters more than petal width here
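# Minimal cost-complexity pruning on the breast cancer dataset: pruning
# minimizes R_alpha(T) = R(T) + alpha * (number of leaves), so a larger alpha
# trades training impurity for a smaller tree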
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = DecisionTreeClassifier(random_state=0)
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities # get alpha values and their impurities
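# ccp_alphas is increasing, and total leaf impurity rises with alpha; the
# largest alpha prunes the tree all the way down to a single root node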
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)
clfs = clfs[:-1]  # drop the last tree: its alpha leaves only the root node
ccp_alphas = ccp_alphas[:-1]
train_scores = [clf.score(X_train, y_train) for clf in clfs]
test_scores = [clf.score(X_test, y_test) for clf in clfs]
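# A minimal sketch (not from the original): pick the alpha with the best
# held-out accuracy; in practice prefer cross-validation, e.g. GridSearchCV
# over ccp_alpha
import numpy as np
best_idx = int(np.argmax(test_scores))
print(ccp_alphas[best_idx], test_scores[best_idx])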
fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, marker='o', label="test", drawstyle="steps-post")
ax.legend()
from sklearn.tree import DecisionTreeRegressor
X = [[0, 0], [2, 2]]  # toy regression set: two samples, two features
y = [0.5, 2.5]
regr = DecisionTreeRegressor()
regr = regr.fit(X, y)
regr.predict([[1, 1]])  # a leaf predicts the mean target of its training samples
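# With only two training points there is a single split; [1, 1] falls in the
# leaf containing [0, 0], so the prediction is that leaf's mean target, 0.5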
import numpy as np
rng = np.random.RandomState(1)
X = np.sort(200 * rng.rand(100, 1) - 100, axis=0)  # 100 sorted inputs in [-100, 100]
y = np.array([np.pi * np.sin(X).ravel(), np.pi * np.cos(X).ravel()]).T  # two outputs tracing a circle of radius pi
y[::5, :] += 0.5 - rng.rand(20, 2)  # add noise to every 5th sample
regr = DecisionTreeRegressor(max_depth=6)
regr.fit(X, y)
X_test = np.arange(-100.0, 100.0, 0.01)[:, np.newaxis]
y_test = regr.predict(X_test)
plt.scatter(y[:, 0], y[:, 1], c="cornflowerblue", s=40, edgecolor="black", label="data")
plt.scatter(y_test[:, 0], y_test[:, 1], c="cornflowerblue", s=40, edgecolor="red", label="prediction")
plt.legend()
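# A rough illustration (assumed, not from the original): training R^2, averaged
# over both outputs, grows with depth, showing how max_depth controls the fit
for depth in (2, 6, 12):
    m = DecisionTreeRegressor(max_depth=depth).fit(X, y)
    print(depth, m.score(X, y))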