Density Estimation

One-Dimensional Kernel Density Estimation

Generate 1000 random numbers from a bimodal distribution (a mixture of two Gaussians)

In [81]:
import numpy as np

# generate bimodal random numbers: a mixture of two Gaussians, one shifted by +5
def make_data(N, f=0.3, rseed=1):
    rand = np.random.RandomState(rseed)
    x = rand.randn(N)
    x[int(f * N):] += 5  # shift the last (1 - f) fraction of points
    return x

X = make_data(1000)

Gaussian kernel with a fixed bandwidth

In [82]:
from sklearn.neighbors import KernelDensity
kde = KernelDensity(kernel='gaussian', bandwidth=0.1).fit(X.reshape(-1, 1))  # fit the KDE to the 1-D sample (as a column vector)
In [83]:
X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]
log_density = kde.score_samples(X_plot) # calculate the log density from trained KDE
In [85]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots();
ax.plot(X_plot, np.exp(log_density));
hist = ax.hist(X, bins=30, density=True)  # normalized histogram (probability density)
ax.scatter(X, np.zeros(1000) - 0.01, marker='+', color='r');  # sample points along the bottom
In [86]:
density, bins, patches = hist
In [87]:
# the area under the normalized histogram is 1
widths = bins[1:] - bins[:-1]
(density * widths).sum()
Out[87]:
1.0
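
As a quick sanity check (a sketch, not part of the original run), the fitted KDE should also integrate to roughly 1 over this plotting grid, reusing X_plot and log_density from above:

In [ ]:
# Sketch: numerically integrate the KDE over the plotting grid; the result should be close to 1
np.trapz(np.exp(log_density), X_plot.ravel())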

Bandwidth selection

  • reference rules (statistical rules of thumb)
    • estimate the bandwidth from a theoretical form based on assumptions about the data distribution (see the sketch after this list)
  • cross-validation
    • fit the model to part of the data, then compute a quantitative metric of how well it fits the remaining data; this works regardless of the underlying data distribution
    • generally more trustworthy
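
To illustrate a reference rule, here is a sketch of Silverman's rule of thumb for a 1-D Gaussian kernel; it assumes the data are roughly normal, which is only approximately true for this bimodal sample (the helper name is illustrative):

In [ ]:
# Sketch: Silverman's rule-of-thumb bandwidth (a reference rule assuming roughly Gaussian data)
def silverman_bandwidth(x):
    x = np.asarray(x)
    n = x.size
    sigma = x.std(ddof=1)
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    return 0.9 * min(sigma, iqr / 1.34) * n ** (-1 / 5)

silverman_bandwidth(X)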

Kernel

  • available kernels: Gaussian, Tophat, Epanechnikov, Exponential, Linear, Cosine
  • with cross-validation, the bandwidth (and, optionally, the kernel) is chosen by maximizing the held-out log probability density (see the sketch below)
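
The kernel choice can in principle be cross-validated together with the bandwidth; a sketch (hypothetical cell, using a coarser grid and 5-fold CV to keep it fast, reusing the imports above):

In [ ]:
# Sketch: search over the kernel type together with the bandwidth
from sklearn.model_selection import GridSearchCV

param_grid = {'kernel': ['gaussian', 'tophat', 'epanechnikov',
                         'exponential', 'linear', 'cosine'],
              'bandwidth': 10 ** np.linspace(-1, 1, 20)}
kernel_grid = GridSearchCV(KernelDensity(), param_grid, cv=5)
kernel_grid.fit(X[:, None])
kernel_grid.best_params_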
In [90]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut

bandwidths = 10 ** np.linspace(-1, 1, 100)
grid = GridSearchCV(KernelDensity(kernel='gaussian'), {'bandwidth': bandwidths}, cv=LeaveOneOut())
grid.fit(X[:, None]);

Plot with the chosen bandwidth

  • With 10-fold cross-validation (cv=10) the optimized bandwidth comes out around 0.36; the leave-one-out search above gives roughly 0.35, as shown below (a sketch of the cv=10 variant follows the plot)
In [91]:
grid.best_params_
Out[91]:
{'bandwidth': 0.35111917342151316}
In [95]:
kde = grid.best_estimator_
In [96]:
log_density = kde.score_samples(X_plot)
In [97]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots();
ax.plot(X_plot, np.exp(log_density));
hist = ax.hist(X, bins=30, density=True)  # normalized histogram (probability density)
ax.scatter(X, np.zeros(1000) - 0.01, marker='+', color='r');  # sample points along the bottom
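
The note above quotes a bandwidth of roughly 0.36 with 10-fold cross-validation; a sketch of that (much faster) variant, reusing the bandwidths array defined earlier:

In [ ]:
# Sketch: the same bandwidth search with 10-fold CV instead of leave-one-out
grid_cv10 = GridSearchCV(KernelDensity(kernel='gaussian'),
                         {'bandwidth': bandwidths}, cv=10)
grid_cv10.fit(X[:, None])
grid_cv10.best_params_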


Two-Dimensional Kernel Density Estimation

In [115]:
m1 = np.random.normal(size=1000)             # first coordinate: standard normal
m2 = np.random.normal(scale=0.5, size=1000)  # second coordinate: narrower normal
In [118]:
X = np.vstack([m1.ravel(), m2.ravel()]).T  # stack the two coordinates into an (n_samples, 2) array
X.shape
Out[118]:
(1000, 2)
In [120]:
bandwidths = 10 ** np.linspace(-1, 1, 100)
grid = GridSearchCV(KernelDensity(kernel='gaussian'), {'bandwidth': bandwidths}, cv=LeaveOneOut())
grid.fit(X)
Out[120]:
GridSearchCV(cv=LeaveOneOut(), error_score='raise-deprecating',
             estimator=KernelDensity(algorithm='auto', atol=0, bandwidth=1.0,
                                     breadth_first=True, kernel='gaussian',
                                     leaf_size=40, metric='euclidean',
                                     metric_params=None, rtol=0),
             iid='warn', n_jobs=None,
             param_grid={'bandwidth': array([ 0.1       ,  0.10476158,  0.10974988,  0.1149757 ,  0.12045035,
        0.12618569,  0.13219411,  0.13848...
        3.27454916,  3.43046929,  3.59381366,  3.76493581,  3.94420606,
        4.1320124 ,  4.32876128,  4.53487851,  4.75081016,  4.97702356,
        5.21400829,  5.46227722,  5.72236766,  5.9948425 ,  6.28029144,
        6.57933225,  6.8926121 ,  7.22080902,  7.56463328,  7.92482898,
        8.30217568,  8.69749003,  9.11162756,  9.54548457, 10.        ])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
In [121]:
X_test = np.array([[-3, 1], [0, 0]])
In [125]:
kde = grid.best_estimator_
log_density = kde.score_samples(X_test)
np.exp(log_density)
Out[125]:
array([1.36570995e-04, 3.10088881e-01])
In [128]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots();
ax.scatter(m1, m2, marker = '+', color='r'); # scatter plot of sample points
ax.scatter(X_test[:, 0], X_test[:, 1], marker = 'o', color='b')
Out[128]:
<matplotlib.collections.PathCollection at 0x1a1d0a02b0>
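
To visualize the fitted two-dimensional density rather than just the sample points, a sketch (hypothetical cell, with an illustrative evaluation grid) that evaluates the KDE on a regular grid and draws contours:

In [ ]:
# Sketch: evaluate the 2-D KDE on a regular grid and overlay density contours on the samples
xx, yy = np.meshgrid(np.linspace(-4, 4, 100), np.linspace(-2, 2, 100))
grid_points = np.vstack([xx.ravel(), yy.ravel()]).T
zz = np.exp(kde.score_samples(grid_points)).reshape(xx.shape)
fig, ax = plt.subplots()
ax.contourf(xx, yy, zz, levels=20, cmap='Blues')
ax.scatter(m1, m2, marker='+', color='r', alpha=0.3);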

Digits

  • Learn a generative model for the digits dataset
  • Draw new samples from this generative model
In [98]:
from sklearn.datasets import load_digits
digits = load_digits()
In [99]:
digits.data.shape  # 1797 digits, each an 8x8 image flattened to 64 features
Out[99]:
(1797, 64)
In [100]:
from sklearn.decomposition import PCA
pca = PCA(n_components=15, whiten=False)
data = pca.fit_transform(digits.data)
In [101]:
params = {'bandwidth': np.logspace(-1, 1, 20)}
grid = GridSearchCV(KernelDensity(), params)
grid.fit(data)
Out[101]:
GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=KernelDensity(algorithm='auto', atol=0, bandwidth=1.0,
                                     breadth_first=True, kernel='gaussian',
                                     leaf_size=40, metric='euclidean',
                                     metric_params=None, rtol=0),
             iid='warn', n_jobs=None,
             param_grid={'bandwidth': array([ 0.1       ,  0.1274275 ,  0.16237767,  0.20691381,  0.26366509,
        0.33598183,  0.42813324,  0.54555948,  0.6951928 ,  0.88586679,
        1.12883789,  1.43844989,  1.83298071,  2.33572147,  2.97635144,
        3.79269019,  4.83293024,  6.15848211,  7.8475997 , 10.        ])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
In [103]:
kde = grid.best_estimator_
In [104]:
new_data = kde.sample(44, random_state=0)
new_data = pca.inverse_transform(new_data)
new_data.shape
Out[104]:
(44, 64)
In [105]:
fig, ax = plt.subplots();
ax.imshow(new_data[1, :].reshape(8, 8))
Out[105]:
<matplotlib.image.AxesImage at 0x1a20c3f080>
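
Only a single draw is shown above; a sketch (hypothetical cell) that lays out all 44 generated digits in a grid, reusing new_data from In [104]:

In [ ]:
# Sketch: display all 44 sampled digits at once on a 4 x 11 grid
fig, axes = plt.subplots(4, 11, figsize=(11, 4),
                         subplot_kw={'xticks': [], 'yticks': []})
for i, axi in enumerate(axes.flat):
    axi.imshow(new_data[i].reshape(8, 8), cmap='binary')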

Species Distributions

In [106]:
from sklearn.datasets import fetch_species_distributions
data = fetch_species_distributions()
In [107]:
species_names = ['Bradypus Variegatus', 'Microryzomys Minutus']
Xtrain = np.vstack([data['train']['dd lat'], data['train']['dd long']]).T
ytrain = np.array([d.decode('ascii').startswith('micro') for d in data['train']['species']], dtype='int')
Xtrain *= np.pi / 180.  # convert degrees to radians (the haversine metric expects radians)
In [108]:
def construct_grids(batch):
    """Construct the map grid from the batch object

    Parameters
    ----------
    batch : Batch object
        The object returned by :func:`fetch_species_distributions`

    Returns
    -------
    (xgrid, ygrid) : 1-D arrays
        The grid corresponding to the values in batch.coverages
    """
    # x,y coordinates for corner cells
    xmin = batch.x_left_lower_corner + batch.grid_size
    xmax = xmin + (batch.Nx * batch.grid_size)
    ymin = batch.y_left_lower_corner + batch.grid_size
    ymax = ymin + (batch.Ny * batch.grid_size)

    # x coordinates of the grid cells
    xgrid = np.arange(xmin, xmax, batch.grid_size)
    # y coordinates of the grid cells
    ygrid = np.arange(ymin, ymax, batch.grid_size)

    return (xgrid, ygrid)
In [109]:
xgrid, ygrid = construct_grids(data)
X, Y = np.meshgrid(xgrid[::5], ygrid[::5][::-1])  # coarsen the grid by a factor of 5
land_reference = data.coverages[6][::5, ::5]
land_mask = (land_reference > -9999).ravel()  # -9999 marks ocean cells
In [110]:
xy = np.vstack([Y.ravel(), X.ravel()]).T
xy = xy[land_mask]  # keep only land cells
xy *= np.pi / 180.  # degrees to radians for the haversine metric
In [111]:
kde = KernelDensity(bandwidth=0.04, metric='haversine', kernel='gaussian', algorithm='ball_tree')
kde.fit(Xtrain[ytrain == 0])
Out[111]:
KernelDensity(algorithm='ball_tree', atol=0, bandwidth=0.04, breadth_first=True,
              kernel='gaussian', leaf_size=40, metric='haversine',
              metric_params=None, rtol=0)
In [112]:
Z = np.full(land_mask.shape[0], -9999.0)  # -9999 marks ocean cells; floats so densities are not truncated
Z[land_mask] = np.exp(kde.score_samples(xy))
Z = Z.reshape(X.shape)
In [113]:
levels = np.linspace(0, Z.max(), 25)
plt.contourf(X, Y, Z, levels=levels, cmap=plt.cm.Reds)
plt.contour(X, Y, land_reference, levels=[-9998], colors="k", linestyles="solid")
Out[113]:
<matplotlib.contour.QuadContourSet at 0x1a1928e860>
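
The density above is fit only to the first species (ytrain == 0, Bradypus variegatus); a sketch (hypothetical cell) of the same steps for the second species, Microryzomys minutus:

In [ ]:
# Sketch: repeat the density estimate and map for the second species (ytrain == 1)
kde2 = KernelDensity(bandwidth=0.04, metric='haversine',
                     kernel='gaussian', algorithm='ball_tree')
kde2.fit(Xtrain[ytrain == 1])
Z2 = np.full(land_mask.shape[0], -9999.0)  # -9999 marks ocean cells
Z2[land_mask] = np.exp(kde2.score_samples(xy))
Z2 = Z2.reshape(X.shape)
plt.contourf(X, Y, Z2, levels=np.linspace(0, Z2.max(), 25), cmap=plt.cm.Blues)
plt.contour(X, Y, land_reference, levels=[-9998], colors="k", linestyles="solid")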